library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-3
library(ROCR)
library(MASS)
library(ggplot2)
library(pheatmap)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.6 ✓ purrr 0.3.4
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::combine() masks randomForest::combine()
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x randomForest::margin() masks ggplot2::margin()
## x tidyr::pack() masks Matrix::pack()
## x dplyr::select() masks MASS::select()
## x tidyr::unpack() masks Matrix::unpack()
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(corrplot)
## corrplot 0.92 loaded
library(ggplot2)
library(ggthemes)
library(vcd)
#install.packages("gtsummary")
library(gtsummary)
##
## Attaching package: 'gtsummary'
## The following object is masked from 'package:MASS':
##
## select
library(Hmisc)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
##
## coords
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
Load Theme for plots
theme_set(theme_fivethirtyeight())
# fivethirtyeight hides axis titles by default; re-enable them, and center
# every plot title (default is left-aligned)
theme_update(
  axis.title = element_text(),
  plot.title = element_text(hjust = 0.5)
)
Load Data downloaded from UCI and stored on github repo https://archive.ics.uci.edu/ml/datasets/Adult
# Adult census data from UCI, mirrored on the project repo.
# header = FALSE because the raw file has no header row.
# (idiom fix: use `<-` for assignment, matching the rest of the script)
adult <- read.csv(
  "https://raw.githubusercontent.com/rickfontenot/Predicting_Income/main/data/adult.data",
  header = FALSE
)
Description of variables from UCI:
Response: >50K, <=50K.
age: continuous. workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked. fnlwgt: continuous. education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool. education-num: continuous. marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse. occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces. relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. sex: Female, Male. capital-gain: continuous. capital-loss: continuous. hours-per-week: continuous. native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
Add column names to data set:
# Column names from the UCI data dictionary. Underscores replace the
# hyphens used by UCI so the columns can be referenced without backticks.
colnames(adult) <- c(
  "age", "workclass", "fnlwgt", "education", "education_num",
  "marital_status", "occupation", "relationship", "race", "sex",
  "capital_gain", "capital_loss", "hours_per_week", "native_country",
  "income"
)
Investigate NA values to determine what needs resolution
#Replace "?" with NA and re-do missing value analysis
# Work on a copy so the original data keeps its raw codes for later recoding
missing_values <- adult
# The raw values carry a leading space (" ?") because read.csv did not strip
# whitespace after the comma separators, so match " ?" not "?"
missing_values[, 1:14][missing_values[, 1:14] == " ?"] <- NA
# VIM::aggr plots per-variable missing counts and missing-combination patterns
aggr_plot <- aggr(missing_values, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(adult), cex.axis=.7, gap=3, ylab=c("Percent data missing","Combinations Missing"), prop=FALSE,cex.numbers=0.8)
##
## Variables sorted by number of missings:
## Variable Count
## occupation 1843
## workclass 1836
## native_country 583
## age 0
## fnlwgt 0
## education 0
## education_num 0
## marital_status 0
## relationship 0
## race 0
## sex 0
## capital_gain 0
## capital_loss 0
## hours_per_week 0
## income 0
#occupation missing 5.66% of values
#workclass missing 5.64% of values
#native-country missing 1.79% of values
#Note that half of the missing workclass values occur on observations that are also missing occupation
Examine formats of data available
# Inspect structure: all categorical columns are still plain character vectors
str(adult)
## 'data.frame': 32561 obs. of 15 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : chr " State-gov" " Self-emp-not-inc" " Private" " Private" ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : chr " Bachelors" " Bachelors" " HS-grad" " 11th" ...
## $ education_num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital_status: chr " Never-married" " Married-civ-spouse" " Divorced" " Married-civ-spouse" ...
## $ occupation : chr " Adm-clerical" " Exec-managerial" " Handlers-cleaners" " Handlers-cleaners" ...
## $ relationship : chr " Not-in-family" " Husband" " Not-in-family" " Husband" ...
## $ race : chr " White" " White" " White" " Black" ...
## $ sex : chr " Male" " Male" " Male" " Male" ...
## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week: int 40 13 40 40 40 40 16 45 50 40 ...
## $ native_country: chr " United-States" " United-States" " United-States" " United-States" ...
## $ income : chr " <=50K" " <=50K" " <=50K" " <=50K" ...
# Recode missing responses (" ?", leading space preserved from the raw file)
# to an explicit "no_response" level rather than dropping those rows
adult[, 1:14][adult[, 1:14] == " ?"] <- "no_response"
# Names of the categorical explanatory variables, kept for reuse in later
# selections; convert each of them (plus the response) to a factor
categorical.explanatory <- c("workclass","education","marital_status","occupation","relationship","race","sex","native_country")
for (col in c(categorical.explanatory, "income")) {
  adult[[col]] <- as.factor(adult[[col]])
}
# Confirm the character columns are now factors with the expected level counts
str(adult)
## 'data.frame': 32561 obs. of 15 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : Factor w/ 9 levels " Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : Factor w/ 16 levels " 10th"," 11th",..: 10 10 12 2 10 13 7 12 13 10 ...
## $ education_num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital_status: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ occupation : Factor w/ 15 levels " Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
## $ relationship : Factor w/ 6 levels " Husband"," Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ race : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week: int 40 13 40 40 40 40 16 45 50 40 ...
## $ native_country: Factor w/ 42 levels " Cambodia"," Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
## $ income : Factor w/ 2 levels " <=50K"," >50K": 1 1 1 1 1 1 1 2 2 2 ...
Summary Statistics for Categorical variables & Distributions for Numerical variables
# Summary table of all categorical variables plus the response.
# Use all_of() to name the columns held in an external character vector —
# passing the bare vector is ambiguous tidyselect and is deprecated.
categorical <- adult %>% select(all_of(categorical.explanatory), income)
categorical %>% tbl_summary()
| Characteristic | N = 32,5611 |
|---|---|
| workclass | |
| Federal-gov | 960 (2.9%) |
| Local-gov | 2,093 (6.4%) |
| Never-worked | 7 (<0.1%) |
| Private | 22,696 (70%) |
| Self-emp-inc | 1,116 (3.4%) |
| Self-emp-not-inc | 2,541 (7.8%) |
| State-gov | 1,298 (4.0%) |
| Without-pay | 14 (<0.1%) |
| no_response | 1,836 (5.6%) |
| education | |
| 10th | 933 (2.9%) |
| 11th | 1,175 (3.6%) |
| 12th | 433 (1.3%) |
| 1st-4th | 168 (0.5%) |
| 5th-6th | 333 (1.0%) |
| 7th-8th | 646 (2.0%) |
| 9th | 514 (1.6%) |
| Assoc-acdm | 1,067 (3.3%) |
| Assoc-voc | 1,382 (4.2%) |
| Bachelors | 5,355 (16%) |
| Doctorate | 413 (1.3%) |
| HS-grad | 10,501 (32%) |
| Masters | 1,723 (5.3%) |
| Preschool | 51 (0.2%) |
| Prof-school | 576 (1.8%) |
| Some-college | 7,291 (22%) |
| marital_status | |
| Divorced | 4,443 (14%) |
| Married-AF-spouse | 23 (<0.1%) |
| Married-civ-spouse | 14,976 (46%) |
| Married-spouse-absent | 418 (1.3%) |
| Never-married | 10,683 (33%) |
| Separated | 1,025 (3.1%) |
| Widowed | 993 (3.0%) |
| occupation | |
| Adm-clerical | 3,770 (12%) |
| Armed-Forces | 9 (<0.1%) |
| Craft-repair | 4,099 (13%) |
| Exec-managerial | 4,066 (12%) |
| Farming-fishing | 994 (3.1%) |
| Handlers-cleaners | 1,370 (4.2%) |
| Machine-op-inspct | 2,002 (6.1%) |
| Other-service | 3,295 (10%) |
| Priv-house-serv | 149 (0.5%) |
| Prof-specialty | 4,140 (13%) |
| Protective-serv | 649 (2.0%) |
| Sales | 3,650 (11%) |
| Tech-support | 928 (2.9%) |
| Transport-moving | 1,597 (4.9%) |
| no_response | 1,843 (5.7%) |
| relationship | |
| Husband | 13,193 (41%) |
| Not-in-family | 8,305 (26%) |
| Other-relative | 981 (3.0%) |
| Own-child | 5,068 (16%) |
| Unmarried | 3,446 (11%) |
| Wife | 1,568 (4.8%) |
| race | |
| Amer-Indian-Eskimo | 311 (1.0%) |
| Asian-Pac-Islander | 1,039 (3.2%) |
| Black | 3,124 (9.6%) |
| Other | 271 (0.8%) |
| White | 27,816 (85%) |
| sex | |
| Female | 10,771 (33%) |
| Male | 21,790 (67%) |
| native_country | |
| Cambodia | 19 (<0.1%) |
| Canada | 121 (0.4%) |
| China | 75 (0.2%) |
| Columbia | 59 (0.2%) |
| Cuba | 95 (0.3%) |
| Dominican-Republic | 70 (0.2%) |
| Ecuador | 28 (<0.1%) |
| El-Salvador | 106 (0.3%) |
| England | 90 (0.3%) |
| France | 29 (<0.1%) |
| Germany | 137 (0.4%) |
| Greece | 29 (<0.1%) |
| Guatemala | 64 (0.2%) |
| Haiti | 44 (0.1%) |
| Holand-Netherlands | 1 (<0.1%) |
| Honduras | 13 (<0.1%) |
| Hong | 20 (<0.1%) |
| Hungary | 13 (<0.1%) |
| India | 100 (0.3%) |
| Iran | 43 (0.1%) |
| Ireland | 24 (<0.1%) |
| Italy | 73 (0.2%) |
| Jamaica | 81 (0.2%) |
| Japan | 62 (0.2%) |
| Laos | 18 (<0.1%) |
| Mexico | 643 (2.0%) |
| Nicaragua | 34 (0.1%) |
| Outlying-US(Guam-USVI-etc) | 14 (<0.1%) |
| Peru | 31 (<0.1%) |
| Philippines | 198 (0.6%) |
| Poland | 60 (0.2%) |
| Portugal | 37 (0.1%) |
| Puerto-Rico | 114 (0.4%) |
| Scotland | 12 (<0.1%) |
| South | 80 (0.2%) |
| Taiwan | 51 (0.2%) |
| Thailand | 18 (<0.1%) |
| Trinadad&Tobago | 19 (<0.1%) |
| United-States | 29,170 (90%) |
| Vietnam | 67 (0.2%) |
| Yugoslavia | 16 (<0.1%) |
| no_response | 583 (1.8%) |
| income | |
| <=50K | 24,720 (76%) |
| >50K | 7,841 (24%) |
| 1 n (%) | |
# Histograms of the numeric columns only; -all_of() makes the external-vector
# column drop explicit (bare vectors in select() are deprecated tidyselect)
hist.data.frame(adult %>% select(-all_of(categorical.explanatory), -income), nclass = 20)
Check initial simple logistic regression with no cleaning or additional features (Dropped all NA, dropped fnlwgt, no imputing of missing values) *Note occupation and native_country not significant
AIC: 16561 Accuracy : 0.8188 Sensitivity : 0.9369
Specificity : 0.4584 **Poor Specificity
# Baseline logistic regression: drop fnlwgt (survey weight, not a predictor),
# encode factors as their integer level codes, 70/30 train/test split.
logit.set <- adult %>% select(-fnlwgt)
logit.set <- na.omit(logit.set)  # no-op here: NAs were recoded to "no_response" above

# Binary response: 1 if income " >50K" (levels keep the leading space from the raw file)
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)

# NOTE(review): using factor level codes as numeric imposes an arbitrary
# ordering on nominal variables; kept to match the later model iterations.
to.numeric <- c("workclass","education","marital_status","occupation",
                "relationship","race","sex","native_country")
for (col in to.numeric) logit.set[[col]] <- as.numeric(as.factor(logit.set[[col]]))

# 70/30 split. seq_len(nrow(...)) replaces the redundant seq(1:length(...)).
# NOTE(review): no set.seed(), so the split is not reproducible run-to-run.
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Verify all predictors are numeric and the response is a two-level factor
str(logit.train)
## 'data.frame': 22793 obs. of 14 variables:
## $ age : int 30 36 20 38 30 34 49 29 42 33 ...
## $ workclass : num 4 2 7 4 2 4 1 4 4 4 ...
## $ education : num 16 9 16 13 16 12 10 12 16 2 ...
## $ education_num : int 10 11 10 14 10 9 13 9 10 7 ...
## $ marital_status: num 3 5 5 3 5 5 3 3 6 5 ...
## $ occupation : num 12 10 1 10 11 8 4 7 4 12 ...
## $ relationship : num 6 2 4 1 2 2 1 1 2 5 ...
## $ race : num 5 5 5 5 3 5 5 5 5 3 ...
## $ sex : num 1 1 1 2 2 1 2 2 2 1 ...
## $ capital_gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ capital_loss : int 0 0 0 0 1974 0 0 0 0 0 ...
## $ hours_per_week: int 40 40 40 50 40 28 40 40 60 17 ...
## $ native_country: num 39 39 39 39 39 39 39 39 2 39 ...
## $ income.binary : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 2 1 1 1 ...
# Coefficients, significance and AIC for the baseline model
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.4026 -0.6174 -0.3362 -0.0841 3.4193
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.480e+00 2.786e-01 -30.440 < 2e-16 ***
## age 3.513e-02 1.633e-03 21.512 < 2e-16 ***
## workclass -1.195e-01 1.320e-02 -9.053 < 2e-16 ***
## education 1.838e-02 6.266e-03 2.934 0.00335 **
## education_num 3.354e-01 8.643e-03 38.804 < 2e-16 ***
## marital_status -2.222e-01 1.470e-02 -15.112 < 2e-16 ***
## occupation -2.943e-03 4.650e-03 -0.633 0.52673
## relationship -1.203e-01 1.721e-02 -6.987 2.81e-12 ***
## race 1.118e-01 2.549e-02 4.386 1.15e-05 ***
## sex 9.673e-01 6.074e-02 15.925 < 2e-16 ***
## capital_gain 3.287e-04 1.200e-05 27.397 < 2e-16 ***
## capital_loss 6.738e-04 4.025e-05 16.737 < 2e-16 ***
## hours_per_week 2.902e-02 1.665e-03 17.423 < 2e-16 ***
## native_country -2.012e-03 3.287e-03 -0.612 0.54041
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25139 on 22792 degrees of freedom
## Residual deviance: 17298 on 22779 degrees of freedom
## AIC: 17326
##
## Number of Fisher Scoring iterations: 7
# Score the held-out set and classify at the 0.5 probability threshold
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
logit.test$Prediction <- factor(ifelse(logit.test$IncomeProbability > 0.5, 1, 0))
logit.test$income <- as.factor(logit.test$income.binary)
# Confusion matrix of predicted vs actual class
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6987 1333
## 1 418 1030
##
## Accuracy : 0.8207
## 95% CI : (0.813, 0.8283)
## No Information Rate : 0.7581
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4371
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9436
## Specificity : 0.4359
## Pos Pred Value : 0.8398
## Neg Pred Value : 0.7113
## Prevalence : 0.7581
## Detection Rate : 0.7153
## Detection Prevalence : 0.8518
## Balanced Accuracy : 0.6897
##
## 'Positive' Class : 0
##
Explore Categorical Variables vs. Income
workclass: reducing levels into work_sector improves model performance and simplifies interpretability so choosing to replace workclass
AIC: 16511 vs original 16561 –>improved Accuracy : 0.8201 vs original 0.8188 –>improved Sensitivity : 0.9395 vs original 0.9369 –> improved
Specificity : 0.4571 vs original 0.4584 –>dropped just a little
# Association between income and workclass
workclass <- table(adult$income, adult$workclass)
# Dropped `pop = FALSE`: mosaicplot() has no such argument and disregarded it
# with a warning in the original run
mosaicplot(workclass, shade = TRUE, las = 2, main = "workclass")
# chisq.test warns the approximation may be poor: some workclass levels have
# very small counts (Never-worked = 7, Without-pay = 14)
workclasschisq <- chisq.test(workclass)
workclasschisq
##
## Pearson's Chi-squared test
##
## data: workclass
## X-squared = 1045.7, df = 8, p-value < 2.2e-16
#X-squared = 827.72 , p-value=2.2e-16
# Evaluate reducing workclass to 5 levels:
# Government, Private, Self_Employed, Not_Working, Other (comment fixed — the
# original listed only four categories)
adult$workclass <- trimws(adult$workclass)  # factor -> trimmed character
adult$work_sector <- "Other"
adult$work_sector[adult$workclass %in% c("Federal-gov","Local-gov","State-gov")] <- "Government"
adult$work_sector[adult$workclass %in% c("Private")] <- "Private"
adult$work_sector[adult$workclass %in% c("Self-emp-inc","Self-emp-not-inc")] <- "Self_Employed"
adult$work_sector[adult$workclass %in% c("Never-worked","Without-pay")] <- "Not_Working"
adult$work_sector <- as.factor(adult$work_sector)
work_sector <- table(adult$income, adult$work_sector)
# Dropped `pop = FALSE`: not a mosaicplot() argument (was disregarded with a warning)
mosaicplot(work_sector, shade = TRUE, las = 2, main = "work_sector")
## Warning: In mosaicplot.default(work_sector, shade = TRUE, las = 2, main = "work_sector",
## pop = FALSE) :
## extra argument 'pop' will be disregarded
# Chi-squared test for income vs the reduced work_sector levels
work_sectorchisq <- chisq.test(work_sector)
work_sectorchisq
##
## Pearson's Chi-squared test
##
## data: work_sector
## X-squared = 687.39, df = 4, p-value < 2.2e-16
#X-squared = 565.28 , p-value=2.2e-16
#Since X-Square decreased the difference between levels is not quite as strong but does simplify
#Re-run regression with work_sector in place of workclass
logit.set <- adult %>% select(-fnlwgt, -workclass)
logit.set <- na.omit(logit.set)  # no-op: NAs were recoded to "no_response"
# Binary response: 1 if income " >50K" (level keeps its leading space)
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Encode categorical predictors as integer level codes (arbitrary ordering on
# nominal variables, kept for comparability across model iterations)
to.numeric <- c("work_sector","education","marital_status","occupation",
                "relationship","race","sex","native_country")
for (col in to.numeric) logit.set[[col]] <- as.numeric(as.factor(logit.set[[col]]))
# 70/30 split; NOTE(review): no set.seed(), split not reproducible
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Verify work_sector replaced workclass and all predictors are numeric
str(logit.train)
## 'data.frame': 22793 obs. of 14 variables:
## $ age : int 38 53 20 19 40 33 54 37 42 58 ...
## $ education : num 8 11 12 16 16 8 8 12 12 1 ...
## $ education_num : int 12 16 9 10 10 12 12 9 9 6 ...
## $ marital_status: num 3 3 5 5 1 5 3 3 6 1 ...
## $ occupation : num 4 10 14 15 4 12 3 6 3 14 ...
## $ relationship : num 6 1 3 4 2 2 1 1 5 2 ...
## $ race : num 5 5 5 5 5 4 5 5 3 3 ...
## $ sex : num 1 2 2 1 2 2 2 2 1 2 ...
## $ capital_gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ capital_loss : int 1902 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week: int 40 40 40 35 40 36 40 24 40 50 ...
## $ native_country: num 39 39 39 39 39 39 5 39 39 39 ...
## $ work_sector : num 4 5 4 3 4 4 4 4 4 4 ...
## $ income.binary : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
# Coefficients and AIC for the work_sector model (compare to baseline)
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.3722 -0.6220 -0.3401 -0.0896 3.3930
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.656e+00 2.795e-01 -30.974 < 2e-16 ***
## age 3.408e-02 1.626e-03 20.962 < 2e-16 ***
## education 1.235e-02 6.197e-03 1.993 0.0462 *
## education_num 3.336e-01 8.683e-03 38.427 < 2e-16 ***
## marital_status -2.322e-01 1.474e-02 -15.756 < 2e-16 ***
## occupation -8.202e-03 4.505e-03 -1.821 0.0686 .
## relationship -1.224e-01 1.709e-02 -7.164 7.86e-13 ***
## race 1.101e-01 2.516e-02 4.376 1.21e-05 ***
## sex 8.662e-01 6.038e-02 14.345 < 2e-16 ***
## capital_gain 3.206e-04 1.167e-05 27.475 < 2e-16 ***
## capital_loss 6.762e-04 4.050e-05 16.698 < 2e-16 ***
## hours_per_week 3.107e-02 1.674e-03 18.564 < 2e-16 ***
## native_country 1.671e-04 3.302e-03 0.051 0.9596
## work_sector -2.383e-02 1.598e-02 -1.491 0.1359
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25171 on 22792 degrees of freedom
## Residual deviance: 17433 on 22779 degrees of freedom
## AIC: 17461
##
## Number of Fisher Scoring iterations: 7
# Score the held-out set and classify at the 0.5 probability threshold
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
logit.test$Prediction <- factor(ifelse(logit.test$IncomeProbability > 0.5, 1, 0))
logit.test$income <- as.factor(logit.test$income.binary)
# Confusion matrix of predicted vs actual class
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6972 1296
## 1 447 1053
##
## Accuracy : 0.8216
## 95% CI : (0.8138, 0.8291)
## No Information Rate : 0.7595
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4427
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9397
## Specificity : 0.4483
## Pos Pred Value : 0.8433
## Neg Pred Value : 0.7020
## Prevalence : 0.7595
## Detection Rate : 0.7138
## Detection Prevalence : 0.8464
## Balanced Accuracy : 0.6940
##
## 'Positive' Class : 0
##
#adult <- adult %>% select(-work_sector)
education
# Association between income and education level
education <- table(adult$income, adult$education)
# Dropped `pop = FALSE`: not a mosaicplot() argument (was disregarded with a warning)
mosaicplot(education, shade = TRUE, las = 2, main = "education")
educationchisq <- chisq.test(education)
educationchisq
##
## Pearson's Chi-squared test
##
## data: education
## X-squared = 4429.7, df = 15, p-value < 2.2e-16
#X-squared = 4429.7 , p-value=2.2e-16
#Hard to read since factor levels not in proper order
# Strip leading spaces so education levels match their dictionary names
adult$education <- trimws(adult$education)
# Boxplot with education ordered by education_num: checks the two columns
# encode the same information
ggplot(adult, aes(x = fct_reorder(education, education_num), y = education_num)) +
  geom_boxplot() +
  labs(title = "education vs education_num", x = "Level", y = "Number") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#Note these two columns are the same info, which is better categorical or numerical? Based on initial logistic regression the numeric has lower p-value and more significance
marital_status: reducing levels into marriage_status improves model performance and simplifies interpretability so choosing to replace marital_status
AIC: 14470 vs original 16561 –>improved Accuracy : 0.832 vs original 0.8188 –>improved Sensitivity : 0.9237 vs original 0.9369 –> dropped a little
Specificity : 0.5656 vs original 0.4584 –>improved
# Association between income and marital_status
marital_status <- table(adult$income, adult$marital_status)
# Dropped `pop = FALSE`: not a mosaicplot() argument (was disregarded with a warning)
mosaicplot(marital_status, shade = TRUE, las = 2, main = "marital_status")
marital_statuschisq <- chisq.test(marital_status)
marital_statuschisq
##
## Pearson's Chi-squared test
##
## data: marital_status
## X-squared = 6517.7, df = 6, p-value < 2.2e-16
#X-squared = 6518 , p-value=2.2e-16
#Evaluate reducing levels to Married, Single, Previously-Married
adult$marital_status <- trimws(adult$marital_status)  # factor -> trimmed character
adult$marriage_status <- "Other"
adult$marriage_status[adult$marital_status %in% c("Married-AF-spouse","Married-civ-spouse")] <- "Married"
adult$marriage_status[adult$marital_status %in% c("Divorced","Married-spouse-absent","Separated","Widowed")] <- "Previously-Married"
adult$marriage_status[adult$marital_status %in% c("Never-married")] <- "Single"
adult$marriage_status <- as.factor(adult$marriage_status)
marriage_status <- table(adult$income, adult$marriage_status)
# Dropped `pop = FALSE`: not a mosaicplot() argument (was disregarded with a warning)
mosaicplot(marriage_status, shade = TRUE, las = 2, main = "marriage_status")
marriage_statuschisq <- chisq.test(marriage_status)
marriage_statuschisq
##
## Pearson's Chi-squared test
##
## data: marriage_status
## X-squared = 6509.5, df = 2, p-value < 2.2e-16
#X-squared = 6509 , p-value=2.2e-16
#Only slight decrease in X-squared much easier to interpret
#Relationship may be correlated
#Do marital status + gender = relationship status?
# Cross-tabulate to check whether relationship duplicates marriage_status info
marriage_vs_relationship = table(adult$relationship, adult$marriage_status)
marriage_vs_relationship
##
## Married Previously-Married Single
## Husband 13193 0 0
## Not-in-family 17 3582 4706
## Other-relative 125 245 611
## Own-child 96 487 4485
## Unmarried 0 2565 881
## Wife 1568 0 0
# Very large X-squared (see output below): relationship and marriage_status
# are strongly associated, so the two predictors carry overlapping information
chisq.test(marriage_vs_relationship)
##
## Pearson's Chi-squared test
##
## data: marriage_vs_relationship
## X-squared = 38455, df = 10, p-value < 2.2e-16
#Re-run regression with marriage_status to compare to workclass
logit.set <- adult %>% select(-fnlwgt, -work_sector, -marital_status)
logit.set <- na.omit(logit.set)  # no-op: NAs were recoded to "no_response"
# Binary response: 1 if income " >50K" (level keeps its leading space)
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# workclass and education were trimws()'d earlier and are now character, so
# wrap each column in as.factor() before taking its integer level codes
to.numeric <- c("workclass","education","marriage_status","occupation",
                "relationship","race","sex","native_country")
for (col in to.numeric) logit.set[[col]] <- as.numeric(as.factor(logit.set[[col]]))
# 70/30 split; NOTE(review): no set.seed(), split not reproducible
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Verify marriage_status replaced marital_status and all predictors are numeric
str(logit.train)
## 'data.frame': 22793 obs. of 14 variables:
## $ age : int 49 47 56 19 27 44 36 22 37 50 ...
## $ workclass : num 2 1 5 5 5 7 2 5 5 5 ...
## $ education : num 13 12 16 16 12 15 12 9 12 16 ...
## $ education_num : int 14 9 10 10 9 15 9 11 9 10 ...
## $ occupation : num 10 14 12 13 3 10 1 3 14 4 ...
## $ relationship : num 1 1 1 4 4 1 2 4 2 1 ...
## $ race : num 5 3 5 5 5 5 5 5 3 5 ...
## $ sex : num 2 2 2 1 1 2 2 2 2 2 ...
## $ capital_gain : int 0 0 0 0 0 0 0 0 0 99999 ...
## $ capital_loss : int 1902 0 1848 0 0 0 0 0 0 0 ...
## $ hours_per_week : int 40 40 50 40 45 40 40 40 50 50 ...
## $ native_country : num 39 39 39 42 39 39 39 39 39 39 ...
## $ marriage_status: num 1 1 1 3 3 1 3 3 3 1 ...
## $ income.binary : Factor w/ 2 levels "0","1": 2 2 2 1 1 2 1 1 1 2 ...
# Coefficients and AIC for the marriage_status model (compare to baseline)
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5326 -0.5620 -0.2168 -0.0477 3.6207
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.810e+00 3.121e-01 -21.819 < 2e-16 ***
## age 2.231e-02 1.740e-03 12.821 < 2e-16 ***
## workclass -8.166e-02 1.385e-02 -5.897 3.69e-09 ***
## education 3.840e-03 6.501e-03 0.591 0.554740
## education_num 3.688e-01 9.283e-03 39.727 < 2e-16 ***
## occupation -9.465e-03 4.773e-03 -1.983 0.047374 *
## relationship 1.173e-01 2.016e-02 5.816 6.03e-09 ***
## race 9.536e-02 2.654e-02 3.593 0.000326 ***
## sex 6.750e-01 8.169e-02 8.263 < 2e-16 ***
## capital_gain 3.276e-04 1.213e-05 27.019 < 2e-16 ***
## capital_loss 6.980e-04 4.322e-05 16.149 < 2e-16 ***
## hours_per_week 3.207e-02 1.793e-03 17.889 < 2e-16 ***
## native_country 3.318e-03 3.516e-03 0.944 0.345347
## marriage_status -1.548e+00 3.734e-02 -41.459 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25217 on 22792 degrees of freedom
## Residual deviance: 15308 on 22779 degrees of freedom
## AIC: 15336
##
## Number of Fisher Scoring iterations: 7
# Score the held-out set and classify at the 0.5 probability threshold
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
logit.test$Prediction <- factor(ifelse(logit.test$IncomeProbability > 0.5, 1, 0))
logit.test$income <- as.factor(logit.test$income.binary)
# Confusion matrix of predicted vs actual class
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6889 1033
## 1 550 1296
##
## Accuracy : 0.8379
## 95% CI : (0.8305, 0.8452)
## No Information Rate : 0.7616
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5195
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9261
## Specificity : 0.5565
## Pos Pred Value : 0.8696
## Neg Pred Value : 0.7021
## Prevalence : 0.7616
## Detection Rate : 0.7053
## Detection Prevalence : 0.8110
## Balanced Accuracy : 0.7413
##
## 'Positive' Class : 0
##
#adult <- adult %>% select(-work_sector)
occupation: reducing levels into collar improves model performance and simplifies interpretability so choosing to replace occupation which was not significant in original model anyways
AIC: 16362 vs original 16561 –>improved Accuracy : 0.8248 vs original 0.8188 –>improved Sensitivity : 0.9385 vs original 0.9369 –> improved
Specificity : 0.4724 vs original 0.4584 –>improved
# Association between income and occupation
occupation <- table(adult$income, adult$occupation)
# Dropped `pop = FALSE`: not a mosaicplot() argument (was disregarded with a warning)
mosaicplot(occupation, shade = TRUE, las = 2, main = "occupation")
# chisq.test warns the approximation may be poor: some occupation levels have
# very small counts (Armed-Forces = 9)
occupationchisq <- chisq.test(occupation)
occupationchisq
##
## Pearson's Chi-squared test
##
## data: occupation
## X-squared = 4032, df = 14, p-value < 2.2e-16
#X-squared = 3745 , p-value = 2,2e-16
# Cross-tabulate occupation against workclass to check for overlap between the two predictors.
occupation_vs_workclass = table(adult$occupation, adult$workclass)
occupation_vs_workclass
##
## Federal-gov Local-gov Never-worked no_response Private
## Adm-clerical 317 283 0 0 2833
## Armed-Forces 9 0 0 0 0
## Craft-repair 64 146 0 0 3195
## Exec-managerial 180 214 0 0 2691
## Farming-fishing 8 29 0 0 455
## Handlers-cleaners 23 47 0 0 1273
## Machine-op-inspct 14 12 0 0 1913
## Other-service 35 193 0 0 2740
## Priv-house-serv 0 0 0 0 149
## Prof-specialty 175 705 0 0 2313
## Protective-serv 28 304 0 0 190
## Sales 14 7 0 0 2942
## Tech-support 68 38 0 0 736
## Transport-moving 25 115 0 0 1266
## no_response 0 0 7 1836 0
##
## Self-emp-inc Self-emp-not-inc State-gov Without-pay
## Adm-clerical 31 50 253 3
## Armed-Forces 0 0 0 0
## Craft-repair 106 531 56 1
## Exec-managerial 400 392 189 0
## Farming-fishing 51 430 15 6
## Handlers-cleaners 2 15 9 1
## Machine-op-inspct 13 36 13 1
## Other-service 27 175 124 1
## Priv-house-serv 0 0 0 0
## Prof-specialty 160 373 414 0
## Protective-serv 5 6 116 0
## Sales 291 385 11 0
## Tech-support 3 26 57 0
## Transport-moving 27 122 41 1
## no_response 0 0 0 0
chisq.test(occupation_vs_workclass)
## Warning in chisq.test(occupation_vs_workclass): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: occupation_vs_workclass
## X-squared = 41677, df = 112, p-value < 2.2e-16
# Collapse occupation into a coarse "collar" grouping (White-support / White / Blue / Other).
adult$occupation <- trimws(adult$occupation)
adult$collar <- "Other"
adult$collar[adult$occupation %in% c("Adm-clerical")] <- "White-support"
# BUG FIX: the level strings previously contained embedded newlines
# ("Tech-support\n" and "Armed-Forces\n"), so those occupations never matched
# the trimmed values and silently fell through to "Other".
adult$collar[adult$occupation %in% c("Exec-managerial", "Prof-specialty",
                                     "Protective-serv", "Sales",
                                     "Tech-support")] <- "White"
adult$collar[adult$occupation %in% c("Armed-Forces", "Craft-repair",
                                     "Farming-fishing", "Handlers-cleaners",
                                     "Machine-op-inspct", "Other-service",
                                     "Priv-house-serv",
                                     "Transport-moving")] <- "Blue"
adult$collar <- as.factor(adult$collar)
collar <- table(adult$income, adult$collar)
# FIX: dropped `pop = FALSE` (not a mosaicplot() argument; it only produced a warning).
mosaicplot(collar, shade = TRUE, las = 2, main = "Collar (Occupation Group)")
# Is the reduced "collar" grouping still associated with income?
collarchisq <- chisq.test(collar)
collarchisq
##
## Pearson's Chi-squared test
##
## data: collar
## X-squared = 2884.3, df = 3, p-value < 2.2e-16
#X-squared = 2884 , p-value = 2,2e-16
# **Reduced X-squared but still significant check regression effect
# Re-run the logistic regression with `collar` in place of `occupation`
# (and compare to the workclass-based model above).
logit.set <- adult %>% select(-fnlwgt, -work_sector, -marriage_status, -occupation)
logit.set <- na.omit(logit.set)
# Binary response: 1 when income is " >50K" (the leading space matches the raw data).
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Encode categorical predictors as integer codes.
# NOTE(review): this treats unordered categories as ordinal; kept as-is for
# consistency with the other models in this analysis.
logit.set$workclass <- as.numeric(as.factor(logit.set$workclass))
logit.set$education <- as.numeric(as.factor(logit.set$education))
logit.set$marital_status <- as.numeric(as.factor(logit.set$marital_status))
logit.set$collar <- as.numeric(logit.set$collar)
logit.set$relationship <- as.numeric(logit.set$relationship)
logit.set$race <- as.numeric(logit.set$race)
logit.set$sex <- as.numeric(logit.set$sex)
logit.set$native_country <- as.numeric(logit.set$native_country)
# 70/30 train/test split. FIX: seq_len() replaces the seq(1:n) anti-pattern.
# NOTE(review): no set.seed(), so the split (and printed results) are not reproducible.
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
str(logit.train)
## 'data.frame': 22793 obs. of 14 variables:
## $ age : int 46 44 25 23 31 44 49 56 31 38 ...
## $ workclass : num 5 5 5 5 8 5 5 1 8 5 ...
## $ education : num 10 16 12 4 3 16 1 16 10 12 ...
## $ education_num : int 13 10 9 2 8 10 6 10 13 9 ...
## $ marital_status: num 5 3 3 3 3 7 3 3 3 6 ...
## $ relationship : num 2 1 6 6 6 2 1 1 6 5 ...
## $ race : num 5 5 5 1 5 5 5 2 5 3 ...
## $ sex : num 2 2 1 1 1 1 2 2 1 1 ...
## $ capital_gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ capital_loss : int 1504 0 0 0 0 0 0 0 1902 0 ...
## $ hours_per_week: int 40 50 40 35 40 25 40 40 35 32 ...
## $ native_country: num 39 39 39 26 39 39 39 25 39 39 ...
## $ collar : num 3 1 4 1 1 3 3 4 3 1 ...
## $ income.binary : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 2 1 ...
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.2070 -0.6121 -0.3388 -0.0918 3.4369
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.008e+00 2.825e-01 -31.882 < 2e-16 ***
## age 3.409e-02 1.617e-03 21.080 < 2e-16 ***
## workclass -5.414e-02 1.300e-02 -4.166 3.10e-05 ***
## education 7.297e-03 6.169e-03 1.183 0.237
## education_num 2.985e-01 9.004e-03 33.150 < 2e-16 ***
## marital_status -2.364e-01 1.469e-02 -16.088 < 2e-16 ***
## relationship -1.119e-01 1.717e-02 -6.516 7.21e-11 ***
## race 1.381e-01 2.599e-02 5.316 1.06e-07 ***
## sex 1.047e+00 6.129e-02 17.085 < 2e-16 ***
## capital_gain 3.114e-04 1.152e-05 27.039 < 2e-16 ***
## capital_loss 6.474e-04 4.103e-05 15.780 < 2e-16 ***
## hours_per_week 3.012e-02 1.676e-03 17.970 < 2e-16 ***
## native_country -1.179e-03 3.294e-03 -0.358 0.721
## collar 2.341e-01 2.003e-02 11.689 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25142 on 22792 degrees of freedom
## Residual deviance: 17349 on 22779 degrees of freedom
## AIC: 17377
##
## Number of Fisher Scoring iterations: 7
# Evaluate the collar-based model on the held-out set.
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
# Hard-classify at the 0.5 probability cutoff.
logit.test$Prediction <- as.factor(ifelse(logit.test$IncomeProbability > 0.5, 1, 0))
logit.test$income <- as.factor(logit.test$income.binary)
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6965 1257
## 1 441 1105
##
## Accuracy : 0.8262
## 95% CI : (0.8185, 0.8336)
## No Information Rate : 0.7582
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4627
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9405
## Specificity : 0.4678
## Pos Pred Value : 0.8471
## Neg Pred Value : 0.7147
## Prevalence : 0.7582
## Detection Rate : 0.7130
## Detection Prevalence : 0.8417
## Balanced Accuracy : 0.7041
##
## 'Positive' Class : 0
##
#adult <- adult %>% select(-work_sector)
# relationship ----
# Relationship vs income: mosaic plot plus chi-squared association test.
relationship <- table(adult$income, adult$relationship)
# FIX: dropped `pop = FALSE` (not a mosaicplot() argument; only produced a warning).
mosaicplot(relationship, shade = TRUE, las = 2, main = "relationship")
relationshipchisq <- chisq.test(relationship)
relationshipchisq
##
## Pearson's Chi-squared test
##
## data: relationship
## X-squared = 6699.1, df = 5, p-value < 2.2e-16
#X-squared = 6699 , p-value = 2,2e-16
#Appears to be correlated with Marriage_status and sex, drop from model and compare
# race ----
# Race vs income: mosaic plot plus chi-squared association test.
race <- table(adult$income, adult$race)
# FIX: dropped `pop = FALSE` (not a mosaicplot() argument; only produced a warning).
mosaicplot(race, shade = TRUE, las = 2, main = "race")
racechisq <- chisq.test(race)
racechisq
##
## Pearson's Chi-squared test
##
## data: race
## X-squared = 330.92, df = 4, p-value < 2.2e-16
#X-squared = 331 , p-value = 2,2e-16
# Cross-tabulate race against native_country to check for redundancy between the two.
race_vs_native_country = table(adult$race, adult$native_country)
race_vs_native_country
##
## Cambodia Canada China Columbia Cuba
## Amer-Indian-Eskimo 0 0 0 1 0
## Asian-Pac-Islander 18 1 73 0 0
## Black 1 0 0 0 3
## Other 0 1 0 7 2
## White 0 119 2 51 90
##
## Dominican-Republic Ecuador El-Salvador England
## Amer-Indian-Eskimo 0 0 0 0
## Asian-Pac-Islander 1 0 0 1
## Black 12 0 1 8
## Other 18 9 4 0
## White 39 19 101 81
##
## France Germany Greece Guatemala Haiti
## Amer-Indian-Eskimo 0 1 0 0 0
## Asian-Pac-Islander 0 3 1 0 1
## Black 1 8 0 0 43
## Other 0 1 0 4 0
## White 28 124 28 60 0
##
## Holand-Netherlands Honduras Hong Hungary India Iran
## Amer-Indian-Eskimo 0 0 1 0 0 0
## Asian-Pac-Islander 0 0 17 0 85 6
## Black 0 1 0 0 2 0
## Other 0 0 0 0 5 2
## White 1 12 2 13 8 35
##
## Ireland Italy Jamaica Japan Laos Mexico Nicaragua
## Amer-Indian-Eskimo 0 0 0 0 0 8 0
## Asian-Pac-Islander 1 0 0 38 18 1 0
## Black 0 0 75 3 0 4 2
## Other 0 0 1 2 0 40 4
## White 23 73 5 19 0 590 28
##
## Outlying-US(Guam-USVI-etc) Peru Philippines Poland
## Amer-Indian-Eskimo 0 0 1 0
## Asian-Pac-Islander 0 0 188 1
## Black 6 0 1 0
## Other 0 1 0 0
## White 8 30 8 59
##
## Portugal Puerto-Rico Scotland South Taiwan Thailand
## Amer-Indian-Eskimo 0 1 0 2 0 0
## Asian-Pac-Islander 1 1 0 77 48 16
## Black 0 9 0 0 0 0
## Other 0 21 0 0 1 0
## White 36 82 12 1 2 2
##
## Trinadad&Tobago United-States Vietnam Yugoslavia
## Amer-Indian-Eskimo 0 296 0 0
## Asian-Pac-Islander 2 292 65 0
## Black 16 2832 0 0
## Other 1 129 0 0
## White 0 25621 2 16
##
## no_response
## Amer-Indian-Eskimo 0
## Asian-Pac-Islander 83
## Black 96
## Other 18
## White 386
chisq.test(race_vs_native_country)
## Warning in chisq.test(race_vs_native_country): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: race_vs_native_country
## X-squared = 21815, df = 164, p-value < 2.2e-16
# sex ----
# Sex vs income: mosaic plot plus chi-squared association test.
sex <- table(adult$income, adult$sex)
# FIX: dropped `pop = FALSE` (not a mosaicplot() argument; only produced a warning).
mosaicplot(sex, shade = TRUE, las = 2, main = "sex")
sexchisq <- chisq.test(sex)
sexchisq
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: sex
## X-squared = 1517.8, df = 1, p-value < 2.2e-16
#X-squared = 1518 , p-value = 2,2e-16
# native_country ----
#see above this is correlated to race which may be better predictor
native_country <- table(adult$income, adult$native_country)
# FIX: plot title previously said "race" (copy-paste error); also dropped the
# disregarded `pop = FALSE` argument.
mosaicplot(native_country, shade = TRUE, las = 2, main = "native_country")
native_countrychisq <- chisq.test(native_country)
## Warning in chisq.test(native_country): Chi-squared approximation may be
## incorrect
native_countrychisq
##
## Pearson's Chi-squared test
##
## data: native_country
## X-squared = 317.23, df = 41, p-value < 2.2e-16
#X-squared = NaN , p-value = NA
# Reduce native_country to a Native-Born / Foreign-Born indicator, then compare models.
adult$native_country <- trimws(adult$native_country)
adult$native_born <- "No"
adult$native_born[adult$native_country %in% c("United-States")] <- "Yes"
adult$native_born <- as.factor(adult$native_born)
native_born <- table(adult$income, adult$native_born)
# FIX: dropped `pop = FALSE` (not a mosaicplot() argument; only produced a warning).
mosaicplot(native_born, shade = TRUE, las = 2, main = "native_born")
native_bornchisq <- chisq.test(native_born)
native_bornchisq
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: native_born
## X-squared = 38.426, df = 1, p-value = 5.688e-10
#X-squared = 38.426 , p-value = 5.688e-10
# Re-run the logistic regression with `native_born` in place of `native_country`
# (FIX: the old comment said "birthplace", which is built later).
logit.set <- adult %>% select(-fnlwgt, -work_sector, -marriage_status, -collar, -native_country)
logit.set <- na.omit(logit.set)
# Binary response: 1 when income is " >50K" (leading space matches the raw data).
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Encode categorical predictors as integer codes (NOTE(review): treats unordered
# categories as ordinal; kept for consistency with the other models).
logit.set$workclass <- as.numeric(as.factor(logit.set$workclass))
logit.set$education <- as.numeric(as.factor(logit.set$education))
logit.set$marital_status <- as.numeric(as.factor(logit.set$marital_status))
logit.set$occupation <- as.numeric(as.factor(logit.set$occupation))
logit.set$relationship <- as.numeric(logit.set$relationship)
logit.set$race <- as.numeric(logit.set$race)
logit.set$sex <- as.numeric(logit.set$sex)
logit.set$native_born <- as.numeric(logit.set$native_born)
# 70/30 split. FIX: seq_len() replaces the seq(1:n) anti-pattern.
# NOTE(review): no set.seed(), so the split is not reproducible.
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
str(logit.train)
## 'data.frame': 22793 obs. of 14 variables:
## $ age : int 64 24 18 45 63 22 70 37 63 23 ...
## $ workclass : num 2 5 5 5 5 5 5 5 5 5 ...
## $ education : num 12 12 16 12 12 16 10 9 12 12 ...
## $ education_num : int 9 9 10 9 9 10 13 11 9 9 ...
## $ marital_status: num 7 5 5 3 5 5 7 3 1 3 ...
## $ occupation : num 9 9 1 13 7 13 11 7 4 3 ...
## $ relationship : num 5 4 4 1 2 4 5 1 2 1 ...
## $ race : num 5 5 5 5 5 5 5 5 5 5 ...
## $ sex : num 1 2 1 2 2 2 1 2 1 2 ...
## $ capital_gain : int 0 0 0 0 2174 0 0 0 0 0 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week: int 40 40 15 50 40 24 20 42 40 40 ...
## $ native_born : num 2 1 2 2 2 2 1 2 2 2 ...
## $ income.binary : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.2472 -0.6242 -0.3397 -0.0852 3.4634
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.006e+00 2.828e-01 -31.846 < 2e-16 ***
## age 3.381e-02 1.612e-03 20.978 < 2e-16 ***
## workclass -5.486e-02 1.290e-02 -4.251 2.12e-05 ***
## education 2.165e-02 6.248e-03 3.465 0.000530 ***
## education_num 3.332e-01 8.699e-03 38.302 < 2e-16 ***
## marital_status -2.367e-01 1.463e-02 -16.178 < 2e-16 ***
## occupation -1.732e-04 4.354e-03 -0.040 0.968267
## relationship -1.183e-01 1.708e-02 -6.924 4.39e-12 ***
## race 1.024e-01 2.639e-02 3.878 0.000105 ***
## sex 9.455e-01 6.021e-02 15.703 < 2e-16 ***
## capital_gain 3.067e-04 1.128e-05 27.192 < 2e-16 ***
## capital_loss 6.591e-04 3.949e-05 16.692 < 2e-16 ***
## hours_per_week 2.977e-02 1.651e-03 18.034 < 2e-16 ***
## native_born 1.752e-01 7.207e-02 2.430 0.015083 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25144 on 22792 degrees of freedom
## Residual deviance: 17454 on 22779 degrees of freedom
## AIC: 17482
##
## Number of Fisher Scoring iterations: 7
# Evaluate the native_born-based model on the held-out set.
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
# Hard-classify at the 0.5 probability cutoff.
logit.test$Prediction <- as.factor(ifelse(logit.test$IncomeProbability > 0.5, 1, 0))
logit.test$income <- as.factor(logit.test$income.binary)
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7019 1324
## 1 388 1037
##
## Accuracy : 0.8247
## 95% CI : (0.817, 0.8322)
## No Information Rate : 0.7583
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4472
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9476
## Specificity : 0.4392
## Pos Pred Value : 0.8413
## Neg Pred Value : 0.7277
## Prevalence : 0.7583
## Detection Rate : 0.7186
## Detection Prevalence : 0.8541
## Balanced Accuracy : 0.6934
##
## 'Positive' Class : 0
##
#adult <- adult %>% select(-work_sector)
# Alternative reduction: USA / Mexico / Other, then compare models.
adult$native_country <- trimws(adult$native_country)
adult$birthplace <- "Other"
adult$birthplace[adult$native_country %in% c("United-States")] <- "USA"
adult$birthplace[adult$native_country %in% c("Mexico")] <- "Mexico"
adult$birthplace <- as.factor(adult$birthplace)
birthplace <- table(adult$income, adult$birthplace)
# FIX: dropped `pop = FALSE` (not a mosaicplot() argument; only produced a warning).
mosaicplot(birthplace, shade = TRUE, las = 2, main = "birthplace")
birthplacechisq <- chisq.test(birthplace)
birthplacechisq
##
## Pearson's Chi-squared test
##
## data: birthplace
## X-squared = 131.53, df = 2, p-value < 2.2e-16
#X-squared = 131.53 , p-value = 2.2e-16
#Re-run regression with birthplace to compare to native_country
logit.set <- adult %>% select(-fnlwgt, -work_sector, -marriage_status, -collar,
                              -native_country, -native_born)
logit.set <- na.omit(logit.set)
# Binary response: 1 when income is " >50K" (leading space matches the raw data).
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Encode categorical predictors as integer codes (NOTE(review): treats unordered
# categories as ordinal; kept for consistency with the other models).
logit.set$workclass <- as.numeric(as.factor(logit.set$workclass))
logit.set$education <- as.numeric(as.factor(logit.set$education))
logit.set$marital_status <- as.numeric(as.factor(logit.set$marital_status))
logit.set$occupation <- as.numeric(as.factor(logit.set$occupation))
logit.set$relationship <- as.numeric(logit.set$relationship)
logit.set$race <- as.numeric(logit.set$race)
logit.set$sex <- as.numeric(logit.set$sex)
logit.set$birthplace <- as.numeric(logit.set$birthplace)
# 70/30 split. FIX: seq_len() replaces the seq(1:n) anti-pattern.
# NOTE(review): no set.seed(), so the split is not reproducible.
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
str(logit.train)
## 'data.frame': 22793 obs. of 14 variables:
## $ age : int 90 58 28 30 24 33 22 38 34 46 ...
## $ workclass : num 5 7 2 5 5 5 5 5 5 6 ...
## $ education : num 10 10 12 12 16 1 5 12 16 10 ...
## $ education_num : int 13 13 9 9 10 6 3 9 10 13 ...
## $ marital_status: num 3 3 5 3 5 6 5 3 3 3 ...
## $ occupation : num 4 4 1 3 13 6 6 3 6 13 ...
## $ relationship : num 1 1 3 4 4 2 2 1 1 1 ...
## $ race : num 5 5 5 2 5 5 5 5 5 5 ...
## $ sex : num 2 2 2 2 2 2 2 2 2 2 ...
## $ capital_gain : int 0 0 0 3411 0 0 0 0 0 0 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week: int 45 50 20 40 10 40 40 40 50 45 ...
## $ birthplace : num 3 3 3 3 2 3 1 3 3 3 ...
## $ income.binary : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.5990 -0.6212 -0.3401 -0.0892 3.3216
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.849e+00 2.972e-01 -29.773 < 2e-16 ***
## age 3.444e-02 1.616e-03 21.313 < 2e-16 ***
## workclass -5.296e-02 1.301e-02 -4.071 4.68e-05 ***
## education 1.317e-02 6.189e-03 2.127 0.0334 *
## education_num 3.399e-01 8.654e-03 39.281 < 2e-16 ***
## marital_status -2.257e-01 1.461e-02 -15.449 < 2e-16 ***
## occupation 1.273e-04 4.351e-03 0.029 0.9767
## relationship -1.167e-01 1.705e-02 -6.846 7.61e-12 ***
## race 1.179e-01 2.576e-02 4.577 4.71e-06 ***
## sex 9.435e-01 6.012e-02 15.693 < 2e-16 ***
## capital_gain 3.173e-04 1.179e-05 26.909 < 2e-16 ***
## capital_loss 6.660e-04 3.970e-05 16.774 < 2e-16 ***
## hours_per_week 2.918e-02 1.653e-03 17.652 < 2e-16 ***
## birthplace 3.201e-02 6.185e-02 0.518 0.6047
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25148 on 22792 degrees of freedom
## Residual deviance: 17448 on 22779 degrees of freedom
## AIC: 17476
##
## Number of Fisher Scoring iterations: 7
# Evaluate the birthplace-based model on the held-out set.
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
# Hard-classify at the 0.5 probability cutoff.
logit.test$Prediction <- as.factor(ifelse(logit.test$IncomeProbability > 0.5, 1, 0))
logit.test$income <- as.factor(logit.test$income.binary)
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6978 1279
## 1 431 1080
##
## Accuracy : 0.8249
## 95% CI : (0.8173, 0.8324)
## No Information Rate : 0.7585
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4554
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9418
## Specificity : 0.4578
## Pos Pred Value : 0.8451
## Neg Pred Value : 0.7148
## Prevalence : 0.7585
## Detection Rate : 0.7144
## Detection Prevalence : 0.8453
## Balanced Accuracy : 0.6998
##
## 'Positive' Class : 0
##
#adult <- adult %>% select(-work_sector)
# Re-run the regression with ALL country-derived variables dropped
# (FIX: the old comment said "with birthplace"; this model removes it).
logit.set <- adult %>% select(-fnlwgt, -work_sector, -marriage_status, -collar,
                              -native_country, -native_born, -birthplace)
logit.set <- na.omit(logit.set)
# Binary response: 1 when income is " >50K" (leading space matches the raw data).
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Encode categorical predictors as integer codes (NOTE(review): treats unordered
# categories as ordinal; kept for consistency with the other models).
logit.set$workclass <- as.numeric(as.factor(logit.set$workclass))
logit.set$education <- as.numeric(as.factor(logit.set$education))
logit.set$marital_status <- as.numeric(as.factor(logit.set$marital_status))
logit.set$occupation <- as.numeric(as.factor(logit.set$occupation))
logit.set$relationship <- as.numeric(logit.set$relationship)
logit.set$race <- as.numeric(logit.set$race)
logit.set$sex <- as.numeric(logit.set$sex)
# 70/30 split. FIX: seq_len() replaces the seq(1:n) anti-pattern.
# NOTE(review): no set.seed(), so the split is not reproducible.
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
str(logit.train)
## 'data.frame': 22793 obs. of 13 variables:
## $ age : int 28 42 54 25 19 54 38 56 43 75 ...
## $ workclass : num 5 5 5 5 5 7 5 5 5 4 ...
## $ education : num 9 10 12 12 12 10 16 12 16 12 ...
## $ education_num : int 11 13 9 9 9 13 10 9 10 9 ...
## $ marital_status: num 5 5 3 5 5 3 3 6 1 2 ...
## $ occupation : num 6 13 15 1 9 13 15 3 11 8 ...
## $ relationship : num 2 5 1 2 4 1 1 5 5 6 ...
## $ race : num 5 5 5 5 5 5 3 5 5 5 ...
## $ sex : num 2 2 2 1 1 2 2 2 1 1 ...
## $ capital_gain : int 0 0 0 0 34095 0 0 0 0 2653 ...
## $ capital_loss : int 0 0 1887 0 0 0 0 0 0 0 ...
## $ hours_per_week: int 45 40 65 48 20 40 50 40 38 14 ...
## $ income.binary : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 1 ...
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.3242 -0.6277 -0.3468 -0.0845 3.3748
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.464e+00 2.550e-01 -33.185 < 2e-16 ***
## age 3.333e-02 1.596e-03 20.881 < 2e-16 ***
## workclass -6.036e-02 1.283e-02 -4.706 2.53e-06 ***
## education 1.404e-02 6.134e-03 2.289 0.0221 *
## education_num 3.342e-01 8.524e-03 39.211 < 2e-16 ***
## marital_status -2.393e-01 1.451e-02 -16.488 < 2e-16 ***
## occupation -9.783e-04 4.316e-03 -0.227 0.8207
## relationship -1.033e-01 1.688e-02 -6.123 9.21e-10 ***
## race 1.166e-01 2.484e-02 4.696 2.65e-06 ***
## sex 8.968e-01 5.970e-02 15.023 < 2e-16 ***
## capital_gain 3.174e-04 1.158e-05 27.414 < 2e-16 ***
## capital_loss 6.575e-04 3.995e-05 16.457 < 2e-16 ***
## hours_per_week 2.873e-02 1.645e-03 17.472 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25369 on 22792 degrees of freedom
## Residual deviance: 17679 on 22780 degrees of freedom
## AIC: 17705
##
## Number of Fisher Scoring iterations: 7
# Evaluate the reduced baseline model on the held-out set.
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
# Hard-classify at the 0.5 probability cutoff.
logit.test$Prediction <- as.factor(ifelse(logit.test$IncomeProbability > 0.5, 1, 0))
logit.test$income <- as.factor(logit.test$income.binary)
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7079 1241
## 1 427 1021
##
## Accuracy : 0.8292
## 95% CI : (0.8216, 0.8367)
## No Information Rate : 0.7684
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4512
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9431
## Specificity : 0.4514
## Pos Pred Value : 0.8508
## Neg Pred Value : 0.7051
## Prevalence : 0.7684
## Detection Rate : 0.7247
## Detection Prevalence : 0.8518
## Balanced Accuracy : 0.6972
##
## 'Positive' Class : 0
##
#adult <- adult %>% select(-work_sector)
#Note none of these show as significant in the model, drop all
# Explore numerical variables vs income ----
# capital_gain ----
# Rate of >50K income by capital_gain, first raw, then binned.
adult$income.binary <- 0
adult$income.binary[adult$income == " >50K"] <- 1
capital_gain <- adult %>% group_by(capital_gain) %>% summarise(income_above = mean(income.binary))
# FIX: plot title previously said "education_num" (copy-paste error).
capital_gain %>% ggplot(mapping = aes(y = income_above, x = capital_gain)) +
  geom_point(size = 1.5) +
  labs(title = "%Income above 50K vs capital_gain")
# Raw values are too sparse to read; bin capital gains into 10k-wide intervals.
capital_gain <- adult %>% mutate(capital_gain_bin = cut(capital_gain,
  seq(min(capital_gain), max(capital_gain) + 10000, 10000), right = FALSE))
capital_gain_rates <- capital_gain %>% group_by(capital_gain_bin) %>%
  summarise(n = n(), income_above = mean(income.binary))
capital_gain_rates %>% ggplot(mapping = aes(y = income_above, x = capital_gain_bin)) +
  geom_point(aes(size = n)) +
  labs(title = "%Income above 50K vs capital_gain_bin") +
  geom_text(aes(label = n), hjust = -0.5, vjust = 0.5)
# Collapse to a yes/no indicator for any capital gain.
adult$capital_gain_bin <- "No"
adult$capital_gain_bin[adult$capital_gain > 0] <- "Yes"
adult$capital_gain_bin <- as.factor(adult$capital_gain_bin)
# Re-run the regression with capital_gain_bin in place of raw capital_gain
# (FIX: the old comment said "birthplace", a copy-paste leftover).
logit.set <- adult %>% select(-fnlwgt, -work_sector, -marriage_status, -collar,
                              -native_country, -native_born, -birthplace, -capital_gain)
logit.set <- na.omit(logit.set)
# Binary response: 1 when income is " >50K" (leading space matches the raw data).
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Encode categorical predictors as integer codes (NOTE(review): treats unordered
# categories as ordinal; kept for consistency with the other models).
logit.set$capital_gain_bin <- as.numeric(as.factor(logit.set$capital_gain_bin))
logit.set$workclass <- as.numeric(as.factor(logit.set$workclass))
logit.set$education <- as.numeric(as.factor(logit.set$education))
logit.set$marital_status <- as.numeric(as.factor(logit.set$marital_status))
logit.set$occupation <- as.numeric(as.factor(logit.set$occupation))
logit.set$relationship <- as.numeric(logit.set$relationship)
logit.set$race <- as.numeric(logit.set$race)
logit.set$sex <- as.numeric(logit.set$sex)
# 70/30 split. FIX: seq_len() replaces the seq(1:n) anti-pattern.
# NOTE(review): no set.seed(), so the split is not reproducible.
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
str(logit.train)
## 'data.frame': 22793 obs. of 13 variables:
## $ age : int 23 42 51 24 18 21 24 38 47 66 ...
## $ workclass : num 4 5 5 8 5 5 5 5 5 5 ...
## $ education : num 8 12 7 16 12 12 12 16 12 16 ...
## $ education_num : int 12 9 5 10 9 9 9 10 9 10 ...
## $ marital_status : num 5 3 3 3 5 5 3 1 3 3 ...
## $ occupation : num 8 1 5 11 9 3 3 1 13 4 ...
## $ relationship : num 4 6 1 1 2 2 1 5 1 1 ...
## $ race : num 2 5 5 5 5 5 5 5 5 5 ...
## $ sex : num 2 1 2 2 2 2 2 1 2 2 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week : int 16 53 40 40 25 40 60 40 40 45 ...
## $ income.binary : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
## $ capital_gain_bin: num 1 1 1 1 1 1 1 1 1 1 ...
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7376 -0.6295 -0.3430 -0.0824 3.4573
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.063e+01 2.644e-01 -40.213 < 2e-16 ***
## age 3.379e-02 1.581e-03 21.376 < 2e-16 ***
## workclass -5.677e-02 1.283e-02 -4.424 9.69e-06 ***
## education 1.604e-02 6.093e-03 2.633 0.00846 **
## education_num 3.525e-01 8.498e-03 41.484 < 2e-16 ***
## marital_status -2.269e-01 1.429e-02 -15.875 < 2e-16 ***
## occupation -4.005e-04 4.285e-03 -0.093 0.92553
## relationship -1.180e-01 1.664e-02 -7.092 1.32e-12 ***
## race 1.453e-01 2.480e-02 5.860 4.62e-09 ***
## sex 8.801e-01 5.856e-02 15.029 < 2e-16 ***
## capital_loss 6.786e-04 4.010e-05 16.922 < 2e-16 ***
## hours_per_week 3.128e-02 1.640e-03 19.074 < 2e-16 ***
## capital_gain_bin 1.695e+00 6.001e-02 28.248 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25261 on 22792 degrees of freedom
## Residual deviance: 18001 on 22780 degrees of freedom
## AIC: 18027
##
## Number of Fisher Scoring iterations: 5
# Evaluate the capital_gain_bin model on the held-out set.
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
# Hard-classify at the 0.5 probability cutoff.
logit.test$Prediction <- as.factor(ifelse(logit.test$IncomeProbability > 0.5, 1, 0))
logit.test$income <- as.factor(logit.test$income.binary)
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6916 1285
## 1 542 1025
##
## Accuracy : 0.813
## 95% CI : (0.8051, 0.8207)
## No Information Rate : 0.7635
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4174
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9273
## Specificity : 0.4437
## Pos Pred Value : 0.8433
## Neg Pred Value : 0.6541
## Prevalence : 0.7635
## Detection Rate : 0.7080
## Detection Prevalence : 0.8396
## Balanced Accuracy : 0.6855
##
## 'Positive' Class : 0
##
#adult <- adult %>% select(-work_sector)
# capital_loss ----
# Rate of >50K income by capital_loss, first raw, then binned.
capital_loss <- adult %>% group_by(capital_loss) %>% summarise(income_above = mean(income.binary))
# FIX: plot title previously said "education_num" (copy-paste error).
capital_loss %>% ggplot(mapping = aes(y = income_above, x = capital_loss)) +
  geom_point(size = 1.5) +
  labs(title = "%Income above 50K vs capital_loss")
# Raw values are too sparse to read; bin capital losses into 1k-wide intervals.
capital_loss <- adult %>% mutate(capital_loss_bin = cut(capital_loss,
  seq(min(capital_loss), max(capital_loss) + 1000, 1000), right = FALSE))
capital_loss_rates <- capital_loss %>% group_by(capital_loss_bin) %>%
  summarise(n = n(), income_above = mean(income.binary))
capital_loss_rates %>% ggplot(mapping = aes(y = income_above, x = capital_loss_bin)) +
  geom_point(aes(size = n)) +
  labs(title = "%Income above 50K vs capital_loss_bin") +
  geom_text(aes(label = n), hjust = -0.5, vjust = 0.5)
# Collapse to a yes/no indicator for any capital loss.
adult$capital_loss_bin <- "No"
adult$capital_loss_bin[adult$capital_loss > 0] <- "Yes"
adult$capital_loss_bin <- as.factor(adult$capital_loss_bin)
# Refit the logistic model using the binned capital-loss indicator in place of
# the raw capital_loss column (birthplace and the other engineered columns are
# dropped here — the original comment claiming birthplace is included was wrong)
logit.set <- adult %>% select(-fnlwgt,-work_sector,-marriage_status,-collar,-native_country,-native_born, -birthplace,-capital_gain_bin,-capital_loss)
logit.set <- na.omit(logit.set)
# Encode the response: 1 = income above 50K (note the leading space in the raw level)
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Convert the categorical predictors to their integer factor codes
# (as.numeric(as.factor(.)) is a no-op re-level for columns already factors)
logit.set <- logit.set %>%
  mutate(across(c(capital_loss_bin, workclass, education, marital_status,
                  occupation, relationship, race, sex),
                ~ as.numeric(as.factor(.))))
# 70/30 train/test split — unseeded, so the split is not reproducible
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Inspect the training frame: all predictors are numeric codes at this point
str(logit.train)
## 'data.frame': 22793 obs. of 13 variables:
## $ age : int 22 42 55 33 48 35 34 23 19 45 ...
## $ workclass : num 8 8 1 5 6 1 5 5 5 2 ...
## $ education : num 16 12 16 10 9 11 12 12 12 13 ...
## $ education_num : int 10 9 10 13 11 16 9 9 9 14 ...
## $ marital_status : num 5 1 5 5 3 5 3 5 5 1 ...
## $ occupation : num 11 1 1 11 3 11 3 13 15 11 ...
## $ relationship : num 4 2 5 2 1 2 1 4 4 2 ...
## $ race : num 5 3 5 5 5 1 3 5 5 3 ...
## $ sex : num 1 1 2 1 2 1 2 2 2 2 ...
## $ capital_gain : int 0 0 0 0 0 0 0 0 2176 0 ...
## $ hours_per_week : int 20 25 40 45 60 60 40 40 45 40 ...
## $ income.binary : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
## $ capital_loss_bin: num 1 1 1 1 1 1 1 1 1 1 ...
# Coefficient table and fit statistics for the capital_loss_bin model
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.4063 -0.6234 -0.3404 -0.0867 3.4292
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.822e+00 2.717e-01 -36.146 < 2e-16 ***
## age 3.428e-02 1.602e-03 21.391 < 2e-16 ***
## workclass -5.586e-02 1.300e-02 -4.299 1.72e-05 ***
## education 1.352e-02 6.150e-03 2.198 0.028 *
## education_num 3.357e-01 8.617e-03 38.955 < 2e-16 ***
## marital_status -2.335e-01 1.460e-02 -15.997 < 2e-16 ***
## occupation 2.232e-03 4.337e-03 0.515 0.607
## relationship -1.296e-01 1.710e-02 -7.577 3.54e-14 ***
## race 1.363e-01 2.533e-02 5.382 7.36e-08 ***
## sex 8.822e-01 6.022e-02 14.650 < 2e-16 ***
## capital_gain 3.269e-04 1.177e-05 27.782 < 2e-16 ***
## hours_per_week 3.027e-02 1.653e-03 18.305 < 2e-16 ***
## capital_loss_bin 1.150e+00 7.536e-02 15.260 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25247 on 22792 degrees of freedom
## Residual deviance: 17497 on 22780 degrees of freedom
## AIC: 17523
##
## Number of Fisher Scoring iterations: 7
# Generate test-set probabilities and hard labels at the 0.5 threshold
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
pred_class <- rep(0, nrow(logit.test))
pred_class[logit.test$IncomeProbability > 0.5] <- 1
logit.test$Prediction <- as.factor(pred_class)
logit.test$income <- as.factor(logit.test$income.binary)
# Evaluate classification performance against the true labels
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6990 1260
## 1 462 1056
##
## Accuracy : 0.8237
## 95% CI : (0.816, 0.8312)
## No Information Rate : 0.7629
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.447
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9380
## Specificity : 0.4560
## Pos Pred Value : 0.8473
## Neg Pred Value : 0.6957
## Prevalence : 0.7629
## Detection Rate : 0.7156
## Detection Prevalence : 0.8446
## Balanced Accuracy : 0.6970
##
## 'Positive' Class : 0
##
#adult <- adult %>% select(-work_sector)
Next, examine collapsing capital gain and capital loss into a single yes/no indicator: any capital gain or loss vs. zero.
# Create a single yes/no indicator for having ANY capital gain or loss
adult$capital_gain_or_loss <- "No"
adult$capital_gain_or_loss[(adult$capital_loss + adult$capital_gain > 0)] <- "Yes"
adult$capital_gain_or_loss <- as.factor(adult$capital_gain_or_loss)
# Refit the regression with the combined gain-or-loss flag replacing the raw
# capital_gain / capital_loss columns and their separate bins
# (birthplace is dropped — the original comment claiming otherwise was wrong)
logit.set <- adult %>% select(-fnlwgt,-work_sector,-marriage_status,-collar,-native_country,-native_born, -birthplace,-capital_gain_bin,-capital_loss_bin,-capital_gain,-capital_loss)
logit.set <- na.omit(logit.set)
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Encode categorical predictors as integer factor codes
logit.set <- logit.set %>%
  mutate(across(c(capital_gain_or_loss, workclass, education, marital_status,
                  occupation, relationship, race, sex),
                ~ as.numeric(as.factor(.))))
# 70/30 split (unseeded) and model fit
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
# Structure check: capital_gain_or_loss now replaces the raw capital columns
str(logit.train)
## 'data.frame': 22793 obs. of 12 variables:
## $ age : int 37 26 36 33 60 62 49 45 49 35 ...
## $ workclass : num 7 5 5 5 7 7 2 6 2 5 ...
## $ education : num 12 10 13 12 16 15 13 12 16 9 ...
## $ education_num : int 9 13 14 9 10 15 14 9 10 11 ...
## $ marital_status : num 5 5 5 3 3 3 6 3 3 5 ...
## $ occupation : num 5 9 4 13 4 11 11 3 12 3 ...
## $ relationship : num 2 2 2 1 1 1 5 1 1 4 ...
## $ race : num 5 5 5 5 5 5 5 5 5 3 ...
## $ sex : num 2 2 1 2 2 2 1 2 2 2 ...
## $ hours_per_week : int 86 40 40 60 40 15 70 50 44 40 ...
## $ income.binary : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 1 ...
## $ capital_gain_or_loss: num 1 1 2 1 1 1 1 1 1 1 ...
# Coefficient table for the combined gain-or-loss model
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7699 -0.6337 -0.3464 -0.0814 3.4164
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.330742 0.256451 -40.283 < 2e-16 ***
## age 0.034588 0.001575 21.958 < 2e-16 ***
## workclass -0.060415 0.012761 -4.734 2.20e-06 ***
## education 0.013091 0.006021 2.174 0.0297 *
## education_num 0.346196 0.008426 41.089 < 2e-16 ***
## marital_status -0.226998 0.014199 -15.987 < 2e-16 ***
## occupation 0.002536 0.004248 0.597 0.5505
## relationship -0.115606 0.016455 -7.026 2.13e-12 ***
## race 0.110263 0.023936 4.607 4.10e-06 ***
## sex 0.920630 0.058391 15.767 < 2e-16 ***
## hours_per_week 0.031976 0.001631 19.606 < 2e-16 ***
## capital_gain_or_loss 1.525582 0.049088 31.078 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25324 on 22792 degrees of freedom
## Residual deviance: 18171 on 22781 degrees of freedom
## AIC: 18195
##
## Number of Fisher Scoring iterations: 5
# Probability predictions on the held-out rows, classified at p > 0.5
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
logit.test$Prediction <- factor(as.integer(logit.test$IncomeProbability > 0.5))
logit.test$income <- as.factor(logit.test$income.binary)
# Confusion matrix of predictions vs. true labels
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6924 1259
## 1 562 1023
##
## Accuracy : 0.8136
## 95% CI : (0.8057, 0.8213)
## No Information Rate : 0.7664
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4175
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9249
## Specificity : 0.4483
## Pos Pred Value : 0.8461
## Neg Pred Value : 0.6454
## Prevalence : 0.7664
## Detection Rate : 0.7088
## Detection Prevalence : 0.8377
## Balanced Accuracy : 0.6866
##
## 'Positive' Class : 0
##
#adult <- adult %>% select(-work_sector)
age
# Rebuild the 0/1 income indicator and compute the >50K rate at each age
adult$income.binary <- ifelse(adult$income == " >50K", 1, 0)
age_rates <- adult %>%
  group_by(age) %>%
  summarise(income_above = mean(income.binary))
age_rates %>%
  ggplot(mapping = aes(x = age, y = income_above)) +
  geom_point(size = 1.5) +
  labs(title = "%Income above 50K vs Age")
# The relationship looks non-linear; a straight-line fit quantifies this
linearModel <- lm(income_above ~ age, data = age_rates)
summary(linearModel)
##
## Call:
## lm(formula = income_above ~ age, data = age_rates)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.232272 -0.119139 0.006878 0.121938 0.211329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2090075 0.0439919 4.751 1.02e-05 ***
## age 0.0002644 0.0007710 0.343 0.733
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.139 on 71 degrees of freedom
## Multiple R-squared: 0.001653, Adjusted R-squared: -0.01241
## F-statistic: 0.1176 on 1 and 71 DF, p-value: 0.7327
# Linear fit gave Adj. R^2 ~ 0 — age alone is not significant.
# Add a squared term and refit.
age_rates$age2 <- age_rates$age * age_rates$age
quadraticModel2 <- lm(income_above ~ age + age2, data = age_rates)
summary(quadraticModel2)
##
## Call:
## lm(formula = income_above ~ age + age2, data = age_rates)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.119181 -0.051391 -0.005641 0.036783 0.246704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.921e-01 5.418e-02 -9.083 1.86e-13 ***
## age 3.166e-02 2.243e-03 14.118 < 2e-16 ***
## age2 -2.959e-04 2.081e-05 -14.221 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07097 on 70 degrees of freedom
## Multiple R-squared: 0.7433, Adjusted R-squared: 0.736
## F-statistic: 101.3 on 2 and 70 DF, p-value: < 2.2e-16
# Adj. R^2 = 0.736 (the original comment said 0.726) and the second-order
# term is significant. Overlay the fitted curve using predict() rather than
# re-typing rounded coefficients from the summary, which loses precision.
age <- seq(from = 17, to = 90, by = 1)
second_order <- data.frame(age, age2 = age^2)
second_order$income_above <- predict(quadraticModel2, newdata = second_order)
ggplot(age_rates, aes(y = income_above, x = age)) + geom_point() + geom_line(data = second_order) + labs(title = "%Income above 50K vs Age + Age^2")
# Add a cubic term on top of the quadratic and refit
age_rates$age3 <- age_rates$age2 * age_rates$age
quadraticModel3 <- lm(income_above ~ age + age2 + age3, data = age_rates)
summary(quadraticModel3)
##
## Call:
## lm(formula = income_above ~ age + age2 + age3, data = age_rates)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.118078 -0.042849 -0.004719 0.033859 0.260369
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.188e-01 1.195e-01 -7.691 7.35e-11 ***
## age 6.215e-02 8.040e-03 7.731 6.21e-11 ***
## age2 -9.301e-04 1.628e-04 -5.712 2.60e-07 ***
## age3 3.979e-06 1.015e-06 3.921 0.000205 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06464 on 69 degrees of freedom
## Multiple R-squared: 0.7901, Adjusted R-squared: 0.7809
## F-statistic: 86.56 on 3 and 69 DF, p-value: < 2.2e-16
# Adj. R^2 = 0.78 and the third-order term is significant.
# Use predict() for the overlay: the original hand-typed rounded coefficients
# and, at the cubic term, indexed the global `age` vector instead of
# third_order$age (same values here, but a latent bug).
third_order <- data.frame(age, age2 = age^2, age3 = age^3)
third_order$income_above <- predict(quadraticModel3, newdata = third_order)
# ylim(0, 0.5) clips fitted/observed points above 0.5 (ggplot warns about
# removed rows); the annotation shows the rounded fitted equation.
ggplot(age_rates, aes(y=income_above, x=age)) + geom_point() +geom_line(data = third_order) + ggtitle("%Income above 50K vs Age") + ylim(0, 0.5) + theme(plot.title=element_text(size=24,face="bold")) + annotate('text', x = 50, y = 0.48, label = 'Income above = -0.92 + 0.062*Age - 0.0009*Age^2 + 0.000004*Age^3')
# How can we transform this for regression? Or should Age^2 and Age^3 simply
# be added to the logistic model?
# Empirical log-odds of income>50K by age (fixed title: this plots log-odds)
age_rates$income_above_odds <- age_rates$income_above / (1 - age_rates$income_above)
age_rates$log_income_above_odds <- log(age_rates$income_above_odds)
age_rates %>% ggplot(mapping = aes(y = log_income_above_odds, x = age)) + geom_point(size = 1.5) + labs(title = "Log-odds of income above 50K vs Age")
Re-check the logistic regression with Age^2 and Age^3 added. Both terms are significant.
AIC: 14568 vs. original 14884 --> improved. Accuracy: 0.8391 vs. original 0.8382 --> improved. Sensitivity: 0.9315 vs. original 0.9321 --> dropped slightly.
Specificity: 0.5655 vs. original 0.5640 --> improved.
# Logistic regression with Age^2 and Age^3 added as predictors
logit.set <- adult %>% select(-fnlwgt,-education,-marital_status,-relationship,-native_country)
logit.set$age2 <- logit.set$age^2
logit.set$age3 <- logit.set$age^3
logit.set <- na.omit(logit.set)
# 0/1 response from the raw income label (leading space in the level)
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Integer-encode the raw categorical predictors; the engineered columns
# (work_sector, marriage_status, collar, ...) remain factors
logit.set$workclass <- as.numeric(as.factor(logit.set$workclass))
logit.set$occupation <- as.numeric(as.factor(logit.set$occupation))
logit.set$race <- as.numeric(logit.set$race)
logit.set$sex <- as.numeric(logit.set$sex)
# Unseeded 70/30 split, then fit
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Note: the engineered factor columns and both capital bins are all still
# present here, which produces aliased (NA) coefficients in the fit below
str(logit.train)
## 'data.frame': 22793 obs. of 20 variables:
## $ age : int 36 23 24 56 40 39 32 33 44 63 ...
## $ workclass : num 5 5 5 5 5 5 5 7 5 5 ...
## $ education_num : int 11 9 13 5 9 10 11 9 9 5 ...
## $ occupation : num 4 13 4 1 3 9 3 5 15 9 ...
## $ race : num 5 5 5 5 5 3 5 5 5 5 ...
## $ sex : num 2 2 1 2 2 1 2 2 2 1 ...
## $ capital_gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ capital_loss : int 1887 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week : int 50 40 50 40 40 40 44 70 50 20 ...
## $ work_sector : Factor w/ 5 levels "Government","Not_Working",..: 4 4 4 4 4 4 4 5 4 4 ...
## $ marriage_status : Factor w/ 3 levels "Married","Previously-Married",..: 1 2 3 2 2 3 3 3 2 2 ...
## $ collar : Factor w/ 4 levels "Blue","Other",..: 3 3 3 4 1 1 1 1 1 1 ...
## $ native_born : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 1 2 2 2 ...
## $ birthplace : Factor w/ 3 levels "Mexico","Other",..: 3 3 3 3 3 3 2 3 3 3 ...
## $ income.binary : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
## $ capital_gain_bin : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ capital_loss_bin : Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ capital_gain_or_loss: Factor w/ 2 levels "No","Yes": 2 1 1 1 1 1 1 1 1 1 ...
## $ age2 : num 1296 529 576 3136 1600 ...
## $ age3 : num 46656 12167 13824 175616 64000 ...
# Two coefficients (birthplaceUSA, capital_gain_or_lossYes) come out NA due
# to perfect collinearity — "2 not defined because of singularities"
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.7176 -0.4897 -0.1980 -0.0271 3.8248
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.597e+01 8.113e-01 -19.680 < 2e-16 ***
## age 5.199e-01 4.772e-02 10.895 < 2e-16 ***
## workclass -8.338e-02 1.862e-02 -4.477 7.56e-06 ***
## education_num 2.654e-01 1.045e-02 25.385 < 2e-16 ***
## occupation -3.404e-02 5.872e-03 -5.798 6.70e-09 ***
## race 9.379e-02 2.888e-02 3.248 0.001164 **
## sex 2.182e-01 6.187e-02 3.527 0.000421 ***
## capital_gain 5.903e-04 3.558e-05 16.589 < 2e-16 ***
## capital_loss 1.472e-03 2.460e-04 5.983 2.20e-09 ***
## hours_per_week 2.454e-02 1.944e-03 12.624 < 2e-16 ***
## work_sectorNot_Working -1.094e+01 1.234e+02 -0.089 0.929335
## work_sectorOther -9.584e-01 1.749e-01 -5.481 4.24e-08 ***
## work_sectorPrivate 2.737e-01 6.820e-02 4.014 5.97e-05 ***
## work_sectorSelf_Employed 5.194e-02 1.000e-01 0.519 0.603544
## marriage_statusPreviously-Married -2.251e+00 7.151e-02 -31.482 < 2e-16 ***
## marriage_statusSingle -2.397e+00 7.751e-02 -30.927 < 2e-16 ***
## collarOther 1.231e+00 1.273e-01 9.669 < 2e-16 ***
## collarWhite 9.800e-01 5.528e-02 17.729 < 2e-16 ***
## collarWhite-support 2.063e-01 8.978e-02 2.298 0.021578 *
## native_bornYes 1.012e+00 2.766e-01 3.658 0.000254 ***
## birthplaceOther 9.033e-01 2.876e-01 3.141 0.001683 **
## birthplaceUSA NA NA NA NA
## capital_gain_binYes -1.874e+00 2.046e-01 -9.159 < 2e-16 ***
## capital_loss_binYes -1.668e+00 4.799e-01 -3.475 0.000511 ***
## capital_gain_or_lossYes NA NA NA NA
## age2 -8.343e-03 9.965e-04 -8.372 < 2e-16 ***
## age3 4.088e-05 6.685e-06 6.115 9.63e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25107 on 22792 degrees of freedom
## Residual deviance: 14307 on 22768 degrees of freedom
## AIC: 14357
##
## Number of Fisher Scoring iterations: 12
# Test-set probabilities; the rank-deficiency warning reflects the two
# aliased coefficients (birthplaceUSA, capital_gain_or_lossYes) in the fit
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
logit.test$Prediction <- as.factor(ifelse(logit.test$IncomeProbability > 0.5, 1, 0))
logit.test$income <- as.factor(logit.test$income.binary)
# Summarise classification performance
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6887 899
## 1 504 1478
##
## Accuracy : 0.8564
## 95% CI : (0.8493, 0.8633)
## No Information Rate : 0.7567
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5867
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9318
## Specificity : 0.6218
## Pos Pred Value : 0.8845
## Neg Pred Value : 0.7457
## Prevalence : 0.7567
## Detection Rate : 0.7051
## Detection Prevalence : 0.7971
## Balanced Accuracy : 0.7768
##
## 'Positive' Class : 0
##
fnlwgt
# Mean >50K rate at each distinct fnlwgt value
fnlwgt_rates <- adult %>%
  group_by(fnlwgt) %>%
  summarise(income_above = mean(income.binary))
fnlwgt_rates %>%
  ggplot(mapping = aes(x = fnlwgt, y = income_above)) +
  geom_point(size = 1.5) +
  labs(title = "%Income above 50K vs fnlwgt")
# No relationship; as mentioned previously, drop this variable
education_num
# Recompute the income indicator and the >50K rate per education_num level
adult$income.binary <- ifelse(adult$income == " >50K", 1, 0)
education_num_rates <- adult %>%
  group_by(education_num) %>%
  summarise(income_above = mean(income.binary))
education_num_rates %>%
  ggplot(mapping = aes(x = education_num, y = income_above)) +
  geom_point(size = 1.5) +
  labs(title = "%Income above 50K vs education_num")
# The relationship is not linear; check a quadratic transformation next
linearModel <- lm(income_above ~ education_num, data = education_num_rates)
summary(linearModel)
##
## Call:
## lm(formula = income_above ~ education_num, data = education_num_rates)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.14649 -0.09872 -0.02087 0.08284 0.19918
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.166522 0.060428 -2.756 0.0155 *
## education_num 0.046781 0.006249 7.486 2.94e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1152 on 14 degrees of freedom
## Multiple R-squared: 0.8001, Adjusted R-squared: 0.7858
## F-statistic: 56.04 on 1 and 14 DF, p-value: 2.942e-06
# Linear fit Adj. R^2 = 0.7858; add a squared term and refit
education_num_rates <- education_num_rates %>%
  mutate(education_num2 = education_num^2)
quadraticModel2 <- lm(income_above ~ education_num + education_num2, data = education_num_rates)
summary(quadraticModel2)
##
## Call:
## lm(formula = income_above ~ education_num + education_num2, data = education_num_rates)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.099364 -0.021572 -0.004801 0.028032 0.089210
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1005427 0.0405495 2.480 0.02764 *
## education_num -0.0422404 0.0109785 -3.848 0.00202 **
## education_num2 0.0052366 0.0006278 8.341 1.41e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04745 on 13 degrees of freedom
## Multiple R-squared: 0.9685, Adjusted R-squared: 0.9637
## F-statistic: 200 on 2 and 13 DF, p-value: 1.723e-10
# Adj. R^2 = 0.9637 and the second-order term is significant.
# income = 0.1005427 - 0.0422404*education_num + 0.0052366*education_num^2
# (the quadratic coefficient is POSITIVE — the original comment and plot
# annotation carried a sign error). Use predict() for the overlay instead of
# re-typing rounded coefficients.
education_num <- seq(from = 1, to = 16, by = 1)
second_order <- data.frame(education_num, education_num2 = education_num^2)
second_order$income_above <- predict(quadraticModel2, newdata = second_order)
ggplot(education_num_rates, aes(y = income_above, x = education_num)) + geom_point() + geom_line(data = second_order) + ggtitle("%Income above 50K vs Education") + ylim(0, 0.75) + theme(plot.title = element_text(size = 24, face = "bold")) + annotate('text', x = 9, y = 0.65, label = 'Income above = 0.1 - 0.042*Education + 0.005*Education^2')
## Warning: Removed 1 row(s) containing missing values (geom_path).
# The second-degree quadratic fits well; add Education^2 to the logistic regression
hours_per_week
# Copy the data, rebuild the income indicator, and compute the >50K rate
# (with group size n) for each distinct weekly-hours value
hours_per_week <- adult
hours_per_week$income.binary <- ifelse(hours_per_week$income == " >50K", 1, 0)
hours_per_week_rates <- hours_per_week %>%
  group_by(hours_per_week) %>%
  summarise(n = n(), income_above = mean(income.binary))
hours_per_week_rates %>%
  ggplot(mapping = aes(x = hours_per_week, y = income_above)) +
  geom_point(aes(size = n)) +
  labs(title = "%Income above 50K vs hours_per_week")
# Doesn't appear linear; explore quadratic fits
linearModel <- lm(income_above ~ hours_per_week, data = hours_per_week_rates)
summary(linearModel)
##
## Call:
## lm(formula = income_above ~ hours_per_week, data = hours_per_week_rates)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33418 -0.07409 -0.01386 0.10078 0.75309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0856118 0.0356319 2.403 0.0183 *
## hours_per_week 0.0026443 0.0006342 4.170 6.89e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1749 on 92 degrees of freedom
## Multiple R-squared: 0.159, Adjusted R-squared: 0.1498
## F-statistic: 17.39 on 1 and 92 DF, p-value: 6.89e-05
# Linear Adj. R^2 is only 0.15; add a squared-hours term and refit
hours_per_week_rates <- hours_per_week_rates %>%
  mutate(hours_per_week2 = hours_per_week^2)
quadraticModel2 <- lm(income_above ~ hours_per_week + hours_per_week2, data = hours_per_week_rates)
summary(quadraticModel2)
##
## Call:
## lm(formula = income_above ~ hours_per_week + hours_per_week2,
## data = hours_per_week_rates)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29872 -0.07525 -0.00144 0.08581 0.69952
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.471e-02 5.121e-02 -0.873 0.38492
## hours_per_week 1.044e-02 2.379e-03 4.386 3.1e-05 ***
## hours_per_week2 -7.831e-05 2.314e-05 -3.384 0.00105 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1657 on 91 degrees of freedom
## Multiple R-squared: 0.253, Adjusted R-squared: 0.2366
## F-statistic: 15.41 on 2 and 91 DF, p-value: 1.724e-06
# Adj. R^2 = 0.2366 and the second-order term is significant.
# income = -0.04471 + 0.01044*hours_per_week - 0.00007831*hours_per_week^2
# Overlay the fitted quadratic via predict() instead of re-typing rounded
# coefficients from the summary.
hours_per_week <- seq(from = 1, to = 99, by = 1)
second_order <- data.frame(hours_per_week, hours_per_week2 = hours_per_week^2)
second_order$income_above <- predict(quadraticModel2, newdata = second_order)
ggplot(hours_per_week_rates, aes(y = income_above, x = hours_per_week)) + geom_point() + geom_line(data = second_order) + ggtitle("%Income above 50K vs hours_per_week") + theme(plot.title = element_text(size = 24, face = "bold")) + annotate('text', x = 50, y = 0.825, label = 'Income above = -0.04 + 0.01*Hours - 0.00008*Hours^2')
# Fit is not great; try adding a cubic term
hours_per_week_rates$hours_per_week3 <- hours_per_week_rates$hours_per_week^3
quadraticModel3 <- lm(income_above ~ hours_per_week + hours_per_week2 + hours_per_week3, data = hours_per_week_rates)
summary(quadraticModel3)
##
## Call:
## lm(formula = income_above ~ hours_per_week + hours_per_week2 +
## hours_per_week3, data = hours_per_week_rates)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.35517 -0.06306 -0.00387 0.06889 0.65943
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.125e-02 6.712e-02 1.210 0.22928
## hours_per_week -4.452e-03 5.838e-03 -0.763 0.44768
## hours_per_week2 2.951e-04 1.365e-04 2.163 0.03323 *
## hours_per_week3 -2.499e-06 9.011e-07 -2.774 0.00674 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1599 on 90 degrees of freedom
## Multiple R-squared: 0.3118, Adjusted R-squared: 0.2889
## F-statistic: 13.59 on 3 and 90 DF, p-value: 2.17e-07
# Adj. R^2 = 0.2889 but the first-order term is not significant.
# NOTE: the fitted intercept is +0.08125 (8.125e-02 in the summary); the
# original hand-typed -0.08125, flipping the sign. Using predict() avoids
# such transcription errors entirely.
third_order <- data.frame(hours_per_week,
                          hours_per_week2 = hours_per_week^2,
                          hours_per_week3 = hours_per_week^3)
third_order$income_above <- predict(quadraticModel3, newdata = third_order)
ggplot(hours_per_week_rates, aes(y = income_above, x = hours_per_week)) + geom_point() + geom_line(data = third_order) + labs(title = "%Income above 50K vs hours_per_week + hours_per_week^3")
# Third order doesn't look like a good fit. Try adding the second-order term
# into the model, but non-parametric may be better with this variable
Re-run and compare a new simplified model based on the EDA above:
replace workclass with work_sector; replace marital_status with marriage_status; replace occupation with collar; drop native_country, native_born, and birthplace (none show as significant); drop education and keep education_num. (*Quadratic terms will be added to the more complex model, not to part 1.)
# Simplified model from the EDA: keep the engineered work_sector,
# marriage_status, and collar columns; drop their raw equivalents, the
# birthplace variables, the capital bins, fnlwgt, and education
logit.set <- adult %>% select(-fnlwgt,-workclass,-marital_status,-occupation,-native_country,-native_born, -birthplace,-capital_gain_bin,-capital_loss_bin,-capital_gain_or_loss,-education)
logit.set <- na.omit(logit.set)
# 0/1 response from the raw income label
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Integer factor codes for the categorical predictors
logit.set$work_sector <- as.numeric(as.factor(logit.set$work_sector))
logit.set$marriage_status <- as.numeric(as.factor(logit.set$marriage_status))
logit.set$collar <- as.numeric(as.factor(logit.set$collar))
logit.set$relationship <- as.numeric(logit.set$relationship)
logit.set$race <- as.numeric(logit.set$race)
logit.set$sex <- as.numeric(logit.set$sex)
# Unseeded 70/30 split and fit
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Structure of the simplified feature set (work_sector/marriage_status/collar
# now numeric codes)
str(logit.train)
## 'data.frame': 22793 obs. of 12 variables:
## $ age : int 39 19 55 26 55 45 37 60 19 26 ...
## $ education_num : int 11 10 10 10 9 9 3 6 9 9 ...
## $ relationship : num 5 4 5 1 1 1 5 1 2 6 ...
## $ race : num 3 5 3 5 5 2 5 5 5 5 ...
## $ sex : num 1 2 1 2 2 2 1 2 2 1 ...
## $ capital_gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week : int 40 25 40 60 40 84 40 40 30 56 ...
## $ work_sector : num 4 4 1 4 4 4 4 4 4 4 ...
## $ marriage_status: num 2 3 2 1 1 1 2 1 3 1 ...
## $ collar : num 3 3 4 3 1 1 1 1 4 1 ...
## $ income.binary : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 1 1 ...
# Fit summary; work_sector comes out non-significant (p = 0.565) here
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.3161 -0.5573 -0.2133 -0.0498 3.6514
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.968e+00 2.866e-01 -24.316 < 2e-16 ***
## age 1.992e-02 1.740e-03 11.448 < 2e-16 ***
## education_num 3.186e-01 9.680e-03 32.914 < 2e-16 ***
## relationship 6.795e-02 2.007e-02 3.386 0.00071 ***
## race 1.070e-01 2.692e-02 3.976 7.02e-05 ***
## sex 6.714e-01 8.172e-02 8.215 < 2e-16 ***
## capital_gain 3.201e-04 1.189e-05 26.912 < 2e-16 ***
## capital_loss 6.493e-04 4.362e-05 14.885 < 2e-16 ***
## hours_per_week 3.074e-02 1.793e-03 17.140 < 2e-16 ***
## work_sector -9.852e-03 1.712e-02 -0.575 0.56504
## marriage_status -1.560e+00 3.744e-02 -41.670 < 2e-16 ***
## collar 2.878e-01 2.141e-02 13.445 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25197 on 22792 degrees of freedom
## Residual deviance: 15191 on 22781 degrees of freedom
## AIC: 15215
##
## Number of Fisher Scoring iterations: 7
# Score the test set at the 0.5 cutoff and summarise performance
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
logit.test$Prediction <- factor(as.integer(logit.test$IncomeProbability > 0.5))
logit.test$income <- as.factor(logit.test$income.binary)
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6896 1009
## 1 534 1329
##
## Accuracy : 0.842
## 95% CI : (0.8346, 0.8492)
## No Information Rate : 0.7606
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5337
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9281
## Specificity : 0.5684
## Pos Pred Value : 0.8724
## Neg Pred Value : 0.7134
## Prevalence : 0.7606
## Detection Rate : 0.7060
## Detection Prevalence : 0.8093
## Balanced Accuracy : 0.7483
##
## 'Positive' Class : 0
##
# Check multicollinearity among predictors via variance inflation factors
car::vif(model)
## age education_num relationship race sex
## 1.064376 1.170840 2.563745 1.011843 2.563878
## capital_gain capital_loss hours_per_week work_sector marriage_status
## 1.024415 1.007196 1.067436 1.055507 1.222177
## collar
## 1.238676
#Race & sex show VIF > 2.5
# work_sector showed as non-significant above — refit the model without it
logit.set <- adult %>% select(-fnlwgt,-workclass,-marital_status,-occupation,-native_country,-native_born, -birthplace,-capital_gain_bin,-capital_loss_bin,-capital_gain_or_loss,-education,-work_sector)
logit.set <- na.omit(logit.set)
# 0/1 response from the raw income label
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income == " >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Integer factor codes for the remaining categorical predictors
logit.set$marriage_status <- as.numeric(as.factor(logit.set$marriage_status))
logit.set$collar <- as.numeric(as.factor(logit.set$collar))
logit.set$relationship <- as.numeric(logit.set$relationship)
logit.set$race <- as.numeric(logit.set$race)
logit.set$sex <- as.numeric(logit.set$sex)
# Unseeded 70/30 train/test split and refit
trainIndices <- sample(seq_len(nrow(logit.set)), round(0.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices, ]
logit.test <- logit.set[-trainIndices, ]
model <- glm(income.binary ~ ., family = binomial(link = "logit"), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
str(logit.train)
## 'data.frame': 22793 obs. of 11 variables:
## $ age : int 22 42 45 33 21 31 37 40 19 61 ...
## $ education_num : int 13 9 9 14 9 12 9 13 9 4 ...
## $ relationship : num 2 1 1 2 1 5 2 1 4 2 ...
## $ race : num 5 5 5 3 5 5 5 5 5 5 ...
## $ sex : num 2 2 2 2 2 2 2 2 2 2 ...
## $ capital_gain : int 0 0 15024 0 0 0 0 0 0 0 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week : int 45 24 80 50 40 40 40 40 40 40 ...
## $ marriage_status: num 3 1 1 3 1 3 3 1 3 2 ...
## $ collar : num 3 1 3 3 1 1 1 1 1 1 ...
## $ income.binary : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 1 1 ...
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.3017 -0.5569 -0.2210 -0.0537 3.6345
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.043e+00 2.755e-01 -25.560 < 2e-16 ***
## age 2.031e-02 1.736e-03 11.701 < 2e-16 ***
## education_num 3.166e-01 9.651e-03 32.805 < 2e-16 ***
## relationship 9.215e-02 1.981e-02 4.652 3.29e-06 ***
## race 1.055e-01 2.619e-02 4.029 5.60e-05 ***
## sex 7.008e-01 8.031e-02 8.726 < 2e-16 ***
## capital_gain 3.133e-04 1.173e-05 26.711 < 2e-16 ***
## capital_loss 6.685e-04 4.267e-05 15.667 < 2e-16 ***
## hours_per_week 2.907e-02 1.762e-03 16.495 < 2e-16 ***
## marriage_status -1.538e+00 3.672e-02 -41.887 < 2e-16 ***
## collar 2.881e-01 2.106e-02 13.680 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25192 on 22792 degrees of freedom
## Residual deviance: 15331 on 22782 degrees of freedom
## AIC: 15353
##
## Number of Fisher Scoring iterations: 7
# Score the test set for the re-fit model (work_sector removed) at a 0.5 cutoff
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
logit.test["Prediction"] = 0
logit.test$Prediction[logit.test$IncomeProbability>0.5] = 1
logit.test$Prediction=as.factor(logit.test$Prediction)
logit.test$income=as.factor(logit.test$income.binary)
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6908 1017
## 1 520 1323
##
## Accuracy : 0.8426
## 95% CI : (0.8353, 0.8498)
## No Information Rate : 0.7604
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5342
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9300
## Specificity : 0.5654
## Pos Pred Value : 0.8717
## Neg Pred Value : 0.7179
## Prevalence : 0.7604
## Detection Rate : 0.7072
## Detection Prevalence : 0.8113
## Balanced Accuracy : 0.7477
##
## 'Positive' Class : 0
##
# Re-check multicollinearity after removing work_sector
car::vif(model)
## age education_num relationship race sex
## 1.064281 1.155093 2.564606 1.007460 2.551689
## capital_gain capital_loss hours_per_week marriage_status collar
## 1.023458 1.006873 1.057627 1.216604 1.215319
#Relationship & sex show VIF > 2.5
Use LASSO for variable selection
# LASSO (glmnet) on the full engineered feature set for variable selection.
lasso.set <- adult %>% select(-fnlwgt,-workclass,-marital_status,-occupation,-native_country,-education)
lasso.set <- na.omit(lasso.set)
# Integer-code categorical predictors
lasso.set$marriage_status <- as.numeric(as.factor(lasso.set$marriage_status))
lasso.set$collar <- as.numeric(as.factor(lasso.set$collar))
lasso.set$relationship <- as.numeric(lasso.set$relationship)
lasso.set$race <- as.numeric(lasso.set$race)
lasso.set$sex <- as.numeric(lasso.set$sex)
# 70/30 split; seq_len(nrow(...)) replaces the seq(1:length(...)) anti-pattern
trainIndices <- sample(seq_len(nrow(lasso.set)), round(.7 * nrow(lasso.set)))
lasso.train <- lasso.set[trainIndices,]
lasso.test <- lasso.set[-trainIndices,]
# glmnet needs a numeric design matrix; model.matrix expands remaining factors
train.x <- model.matrix(income~.,lasso.train %>% select(-income.binary))
train.y <- lasso.train$income
# 10-fold CV over a dense lambda path, minimizing misclassification error
cvfit <- cv.glmnet(train.x, train.y, family = "binomial", type.measure = "class", nlambda = 1000)
plot(cvfit)
coef(cvfit, s = "lambda.min")
## 22 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -7.0120496537
## (Intercept) .
## age 0.0207773536
## education_num 0.3175104193
## relationship 0.0660283666
## race 0.1007831631
## sex 0.6329373865
## capital_gain 0.0004722468
## capital_loss 0.0010040584
## hours_per_week 0.0279456586
## work_sectorNot_Working -1.9522878790
## work_sectorOther -0.5952135132
## work_sectorPrivate 0.0440394362
## work_sectorSelf_Employed -0.1953732625
## marriage_status -1.5460063241
## collar 0.2579768111
## native_bornYes 0.2125298455
## birthplaceOther 0.1698659271
## birthplaceUSA 0.0924245971
## capital_gain_binYes -0.5514022989
## capital_loss_binYes .
## capital_gain_or_lossYes -0.6881352560
#CV misclassification error rate is about 0.15 (printed below)
print("CV Error Rate:")
## [1] "CV Error Rate:"
cvfit$cvm[which(cvfit$lambda==cvfit$lambda.min)]
## [1] 0.1506164
#Optimal penalty
print("Penalty Value:")
## [1] "Penalty Value:"
cvfit$lambda.min
## [1] 0.0009346982
#Refit the lasso at the CV-selected penalty for final-model predictions.
#(Note: this refits on the training matrix train.x, not the entire data set.)
finalmodel<-glmnet(train.x, train.y, family = "binomial",lambda=cvfit$lambda.min)
coef(finalmodel)
## 22 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -7.0166851978
## (Intercept) .
## age 0.0207685056
## education_num 0.3175022543
## relationship 0.0660951700
## race 0.1008509221
## sex 0.6329743413
## capital_gain 0.0004717995
## capital_loss 0.0010211854
## hours_per_week 0.0279461988
## work_sectorNot_Working -1.9526684802
## work_sectorOther -0.5955964884
## work_sectorPrivate 0.0440268449
## work_sectorSelf_Employed -0.1955276633
## marriage_status -1.5463036147
## collar 0.2579723601
## native_bornYes 0.1555271888
## birthplaceOther 0.1754400998
## birthplaceUSA 0.1544273523
## capital_gain_binYes -0.5150930903
## capital_loss_binYes .
## capital_gain_or_lossYes -0.7214634034
#Lasso dropped relationship and work_sector, re-run regression and compare
#Re-run regression with birthplace to compare to native_country
# Re-fit dropping relationship and work_sector (per LASSO / significance results)
logit.set <- adult %>% select(-fnlwgt,-workclass,-marital_status,-occupation,-native_country,-native_born, -birthplace,-capital_gain_bin,-capital_loss_bin,-capital_gain_or_loss,-education,-relationship,-work_sector)
logit.set <- na.omit(logit.set)
# Binary response: 1 = income " >50K" (leading space matches the raw adult labels)
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income==" >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Integer-code categorical predictors
logit.set$marriage_status <- as.numeric(as.factor(logit.set$marriage_status))
logit.set$collar <- as.numeric(as.factor(logit.set$collar))
logit.set$race <- as.numeric(logit.set$race)
logit.set$sex <- as.numeric(logit.set$sex)
#logit.set$birthplace <- as.numeric(logit.set$birthplace)
# 70/30 split; seq_len(nrow(...)) replaces the seq(1:length(...)) anti-pattern
trainIndices <- sample(seq_len(nrow(logit.set)), round(.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices,]
logit.test <- logit.set[-trainIndices,]
model <- glm(income.binary ~ ., family = binomial(link = 'logit'), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
str(logit.train)
## 'data.frame': 22793 obs. of 10 variables:
## $ age : int 24 32 56 29 48 19 51 38 39 35 ...
## $ education_num : int 9 14 10 9 10 9 9 13 9 10 ...
## $ race : num 5 5 5 5 5 3 5 3 3 3 ...
## $ sex : num 2 1 2 2 2 1 2 1 2 1 ...
## $ capital_gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 1887 0 ...
## $ hours_per_week : int 50 40 40 40 40 30 40 30 40 30 ...
## $ marriage_status: num 1 3 1 3 1 3 2 1 1 3 ...
## $ collar : num 3 3 1 1 1 3 1 3 1 1 ...
## $ income.binary : Factor w/ 2 levels "0","1": 1 1 2 1 2 1 1 2 2 1 ...
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.3935 -0.5511 -0.2170 -0.0505 3.6777
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.724e+00 2.309e-01 -29.126 < 2e-16 ***
## age 2.026e-02 1.729e-03 11.720 < 2e-16 ***
## education_num 3.242e-01 9.670e-03 33.529 < 2e-16 ***
## race 1.140e-01 2.680e-02 4.253 2.11e-05 ***
## sex 5.338e-01 5.682e-02 9.394 < 2e-16 ***
## capital_gain 3.271e-04 1.207e-05 27.096 < 2e-16 ***
## capital_loss 6.959e-04 4.319e-05 16.112 < 2e-16 ***
## hours_per_week 2.908e-02 1.786e-03 16.284 < 2e-16 ***
## marriage_status -1.511e+00 3.707e-02 -40.775 < 2e-16 ***
## collar 2.772e-01 2.127e-02 13.034 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25197 on 22792 degrees of freedom
## Residual deviance: 15177 on 22783 degrees of freedom
## AIC: 15197
##
## Number of Fisher Scoring iterations: 7
# Score the reduced model (relationship & work_sector dropped) on the test set
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
logit.test["Prediction"] = 0
logit.test$Prediction[logit.test$IncomeProbability>0.5] = 1
logit.test$Prediction=as.factor(logit.test$Prediction)
logit.test$income=as.factor(logit.test$income.binary)
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6925 1057
## 1 505 1281
##
## Accuracy : 0.8401
## 95% CI : (0.8327, 0.8473)
## No Information Rate : 0.7606
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5222
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9320
## Specificity : 0.5479
## Pos Pred Value : 0.8676
## Neg Pred Value : 0.7172
## Prevalence : 0.7606
## Detection Rate : 0.7089
## Detection Prevalence : 0.8172
## Balanced Accuracy : 0.7400
##
## 'Positive' Class : 0
##
# VIF check after dropping relationship
car::vif(model)
## age education_num race sex capital_gain
## 1.049200 1.160241 1.004848 1.242659 1.022520
## capital_loss hours_per_week marriage_status collar
## 1.007653 1.055445 1.210008 1.219155
#All VIFs are now below 2.5 (dropping relationship also brought sex down to ~1.24)
More complex model: use the simple reduced model from LASSO as a starting point, then add polynomial terms: age^2 and age^3 for age, education_num^2 for education_num, and hours_per_week^2 for hours_per_week.
# Complex model: reduced LASSO feature set plus polynomial terms
# (age^2, age^3, education_num^2, hours_per_week^2)
logit.set <- adult %>% select(-fnlwgt,-workclass,-marital_status,-occupation,-native_country,-native_born, -birthplace,-capital_gain_bin,-capital_loss_bin,-capital_gain_or_loss,-education,-relationship,-work_sector)
logit.set <- na.omit(logit.set)
# Binary response: 1 = income " >50K" (leading space matches the raw labels)
logit.set$income.binary <- 0
logit.set$income.binary[logit.set$income==" >50K"] <- 1
logit.set$income.binary <- as.factor(logit.set$income.binary)
logit.set <- logit.set %>% select(-income)
# Polynomial features (raw, uncentered)
logit.set$age2 <- logit.set$age^2
logit.set$age3 <- logit.set$age^3
logit.set$education_num2 <- logit.set$education_num^2
logit.set$hours_per_week2 <- logit.set$hours_per_week^2
# Integer-code categorical predictors
logit.set$marriage_status <- as.numeric(as.factor(logit.set$marriage_status))
logit.set$collar <- as.numeric(as.factor(logit.set$collar))
logit.set$race <- as.numeric(logit.set$race)
logit.set$sex <- as.numeric(logit.set$sex)
#logit.set$birthplace <- as.numeric(logit.set$birthplace)
# 70/30 split; seq_len(nrow(...)) replaces the seq(1:length(...)) anti-pattern
trainIndices <- sample(seq_len(nrow(logit.set)), round(.7 * nrow(logit.set)))
logit.train <- logit.set[trainIndices,]
logit.test <- logit.set[-trainIndices,]
model <- glm(income.binary ~ ., family = binomial(link = 'logit'), data = logit.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
str(logit.train)
## 'data.frame': 22793 obs. of 14 variables:
## $ age : int 25 29 28 22 34 37 24 17 21 33 ...
## $ education_num : int 12 9 9 9 10 13 8 5 10 12 ...
## $ race : num 5 3 5 5 5 5 5 3 5 3 ...
## $ sex : num 2 2 2 2 2 1 2 1 2 2 ...
## $ capital_gain : int 0 0 0 4416 0 0 0 0 0 0 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week : int 20 45 40 40 40 42 55 40 20 60 ...
## $ marriage_status: num 3 3 1 3 3 3 3 3 3 2 ...
## $ collar : num 1 1 3 1 4 2 3 1 1 3 ...
## $ income.binary : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ age2 : num 625 841 784 484 1156 ...
## $ age3 : num 15625 24389 21952 10648 39304 ...
## $ education_num2 : num 144 81 81 81 100 169 64 25 100 144 ...
## $ hours_per_week2: num 400 2025 1600 1600 1600 ...
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = logit.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.3393 -0.5329 -0.1983 -0.0217 3.7996
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.483e+01 7.980e-01 -18.581 < 2e-16 ***
## age 4.815e-01 4.670e-02 10.310 < 2e-16 ***
## education_num 1.456e-01 5.280e-02 2.757 0.00583 **
## race 1.112e-01 2.650e-02 4.195 2.73e-05 ***
## sex 4.968e-01 5.769e-02 8.612 < 2e-16 ***
## capital_gain 3.191e-04 1.200e-05 26.594 < 2e-16 ***
## capital_loss 6.968e-04 4.375e-05 15.929 < 2e-16 ***
## hours_per_week 8.213e-02 7.402e-03 11.095 < 2e-16 ***
## marriage_status -1.408e+00 3.851e-02 -36.554 < 2e-16 ***
## collar 2.821e-01 2.130e-02 13.243 < 2e-16 ***
## age2 -7.983e-03 9.794e-04 -8.151 3.61e-16 ***
## age3 4.052e-05 6.595e-06 6.143 8.07e-10 ***
## education_num2 7.476e-03 2.487e-03 3.006 0.00265 **
## hours_per_week2 -5.930e-04 7.238e-05 -8.193 2.56e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25151 on 22792 degrees of freedom
## Residual deviance: 14928 on 22779 degrees of freedom
## AIC: 14956
##
## Number of Fisher Scoring iterations: 7
# Score the complex (polynomial) model on the test set at a 0.5 cutoff
logit.test$IncomeProbability <- predict(model, newdata = logit.test, type = "response")
logit.test["Prediction"] = 0
logit.test$Prediction[logit.test$IncomeProbability>0.5] = 1
logit.test$Prediction=as.factor(logit.test$Prediction)
logit.test$income=as.factor(logit.test$income.binary)
confusionMatrix(logit.test$Prediction, logit.test$income.binary)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6945 983
## 1 465 1375
##
## Accuracy : 0.8518
## 95% CI : (0.8446, 0.8588)
## No Information Rate : 0.7586
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5625
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9372
## Specificity : 0.5831
## Pos Pred Value : 0.8760
## Neg Pred Value : 0.7473
## Prevalence : 0.7586
## Detection Rate : 0.7110
## Detection Prevalence : 0.8116
## Balanced Accuracy : 0.7602
##
## 'Positive' Class : 0
##
# VIF check for the polynomial model
car::vif(model)
## age education_num race sex capital_gain
## 628.354638 35.728579 1.007272 1.281727 1.027163
## capital_loss hours_per_week marriage_status collar age2
## 1.010633 15.794578 1.237711 1.234265 2395.747241
## age3 education_num2 hours_per_week2
## 620.780877 35.708705 15.422782
# The very large VIFs for age/age2/age3 (and the other squared terms) are the
# expected structural collinearity of raw polynomial terms; centering the
# variables or using poly() would reduce them without changing fitted values.
Import train/test splits created by Cameron
# Import the shared train/test splits created by Cameron (hosted on GitHub).
# "na_rm": rows with NAs removed; "na_as_cat": NA kept as its own category.
train_na_rm <- read.csv("https://raw.githubusercontent.com/rickfontenot/Predicting_Income/main/data/Test_Train_Set/NAs_Removed/train_na_rm.csv", header = TRUE)
test_na_rm <- read.csv("https://raw.githubusercontent.com/rickfontenot/Predicting_Income/main/data/Test_Train_Set/NAs_Removed/test_na_rm.csv", header = TRUE)
train_na_as_cat <- read.csv("https://raw.githubusercontent.com/rickfontenot/Predicting_Income/main/data/Test_Train_Set/NAs_as_a_category/train_na_as_cat.csv", header = TRUE)
test_na_as_cat <- read.csv("https://raw.githubusercontent.com/rickfontenot/Predicting_Income/main/data/Test_Train_Set/NAs_as_a_category/test_na_as_cat.csv", header = TRUE)
Create Feature lists for models
# Feature lists shared by the simple and complex models.
features.simple <- c("age","education_num","race","sex","capital_gain","capital_loss","hours_per_week","marriage_status","collar")
# Complex model = simple features plus the polynomial terms (derived from
# features.simple so the two lists cannot drift out of sync)
features.complex <- c(features.simple, "age2","age3","education_num2","hours_per_week2")
Add features from EDA to the train/test sets
#train_na_rm
# Binary response; trimws() guards against leading/trailing whitespace in labels
train_na_rm$income.binary <- 0
train_na_rm$income.binary[trimws(train_na_rm$income) == ">50K"] <- 1
train_na_rm$income.binary <- as.factor(train_na_rm$income.binary)
# Collapse marital_status into Married / Previously-Married / Single / Other
train_na_rm$marriage_status <- "Other"
train_na_rm$marriage_status[trimws(train_na_rm$marital_status) %in% c("Married-AF-spouse","Married-civ-spouse")] <- "Married"
train_na_rm$marriage_status[trimws(train_na_rm$marital_status) %in% c("Divorced","Married-spouse-absent","Separated","Widowed")] <- "Previously-Married"
train_na_rm$marriage_status[trimws(train_na_rm$marital_status) %in% c("Never-married")] <- "Single"
# Collapse occupation into collar groups.
# BUG FIX: the original literals "Tech-support\n..." and "Armed-Forces\n..."
# contained an embedded newline and could never match an occupation value,
# silently leaving those occupations classified as "Other".
train_na_rm$collar <- "Other"
train_na_rm$collar[trimws(train_na_rm$occupation) %in% c("Adm-clerical")] <- "White-support"
train_na_rm$collar[trimws(train_na_rm$occupation) %in% c("Exec-managerial","Prof-specialty","Protective-serv","Sales","Tech-support")] <- "White"
train_na_rm$collar[trimws(train_na_rm$occupation) %in% c("Armed-Forces","Craft-repair","Farming-fishing","Handlers-cleaners","Machine-op-inspct","Other-service","Priv-house-serv","Transport-moving")] <- "Blue"
# Polynomial features used by the "complex" model
train_na_rm$age2 <- train_na_rm$age^2
train_na_rm$age3 <- train_na_rm$age^3
train_na_rm$education_num2 <- train_na_rm$education_num^2
train_na_rm$hours_per_week2 <- train_na_rm$hours_per_week^2
train_na_rm$race <- as.factor(train_na_rm$race)
train_na_rm$sex <- as.factor(train_na_rm$sex)
train_na_rm$marriage_status <- as.factor(train_na_rm$marriage_status)
train_na_rm$collar <- as.factor(train_na_rm$collar)
#test_na_rm
# Binary response; trimws() guards against leading/trailing whitespace in labels
test_na_rm$income.binary <- 0
test_na_rm$income.binary[trimws(test_na_rm$income) == ">50K"] <- 1
test_na_rm$income.binary <- as.factor(test_na_rm$income.binary)
# Collapse marital_status into Married / Previously-Married / Single / Other
test_na_rm$marriage_status <- "Other"
test_na_rm$marriage_status[trimws(test_na_rm$marital_status) %in% c("Married-AF-spouse","Married-civ-spouse")] <- "Married"
test_na_rm$marriage_status[trimws(test_na_rm$marital_status) %in% c("Divorced","Married-spouse-absent","Separated","Widowed")] <- "Previously-Married"
test_na_rm$marriage_status[trimws(test_na_rm$marital_status) %in% c("Never-married")] <- "Single"
# Collapse occupation into collar groups.
# BUG FIX: the original "Tech-support\n..." / "Armed-Forces\n..." literals had an
# embedded newline and never matched, leaving those occupations as "Other".
test_na_rm$collar <- "Other"
test_na_rm$collar[trimws(test_na_rm$occupation) %in% c("Adm-clerical")] <- "White-support"
test_na_rm$collar[trimws(test_na_rm$occupation) %in% c("Exec-managerial","Prof-specialty","Protective-serv","Sales","Tech-support")] <- "White"
test_na_rm$collar[trimws(test_na_rm$occupation) %in% c("Armed-Forces","Craft-repair","Farming-fishing","Handlers-cleaners","Machine-op-inspct","Other-service","Priv-house-serv","Transport-moving")] <- "Blue"
# Polynomial features used by the "complex" model
test_na_rm$age2 <- test_na_rm$age^2
test_na_rm$age3 <- test_na_rm$age^3
test_na_rm$education_num2 <- test_na_rm$education_num^2
test_na_rm$hours_per_week2 <- test_na_rm$hours_per_week^2
test_na_rm$race <- as.factor(test_na_rm$race)
test_na_rm$sex <- as.factor(test_na_rm$sex)
test_na_rm$marriage_status <- as.factor(test_na_rm$marriage_status)
test_na_rm$collar <- as.factor(test_na_rm$collar)
#train_na_as_cat
# Binary response; trimws() guards against leading/trailing whitespace in labels
train_na_as_cat$income.binary <- 0
train_na_as_cat$income.binary[trimws(train_na_as_cat$income) == ">50K"] <- 1
train_na_as_cat$income.binary <- as.factor(train_na_as_cat$income.binary)
#Evaluate reducing levels to Government, Private, Self-Employed, Other
train_na_as_cat$workclass <- trimws(train_na_as_cat$workclass)
train_na_as_cat$work_sector <- "Other"
train_na_as_cat$work_sector[train_na_as_cat$workclass %in% c("Federal-gov","Local-gov","State-gov")] <- "Government"
train_na_as_cat$work_sector[train_na_as_cat$workclass %in% c("Private")] <- "Private"
train_na_as_cat$work_sector[train_na_as_cat$workclass %in% c("Self-emp-inc","Self-emp-not-inc")] <- "Self_Employed"
train_na_as_cat$work_sector[train_na_as_cat$workclass %in% c("Never-worked","Without-pay")] <- "Not_Working"
train_na_as_cat$work_sector <- as.factor(train_na_as_cat$work_sector)
# Collapse marital_status into Married / Previously-Married / Single / Other
train_na_as_cat$marriage_status <- "Other"
train_na_as_cat$marriage_status[trimws(train_na_as_cat$marital_status) %in% c("Married-AF-spouse","Married-civ-spouse")] <- "Married"
train_na_as_cat$marriage_status[trimws(train_na_as_cat$marital_status) %in% c("Divorced","Married-spouse-absent","Separated","Widowed")] <- "Previously-Married"
train_na_as_cat$marriage_status[trimws(train_na_as_cat$marital_status) %in% c("Never-married")] <- "Single"
# Collapse occupation into collar groups.
# BUG FIX: the original "Tech-support\n..." / "Armed-Forces\n..." literals had an
# embedded newline and never matched, leaving those occupations as "Other".
train_na_as_cat$collar <- "Other"
train_na_as_cat$collar[trimws(train_na_as_cat$occupation) %in% c("Adm-clerical")] <- "White-support"
train_na_as_cat$collar[trimws(train_na_as_cat$occupation) %in% c("Exec-managerial","Prof-specialty","Protective-serv","Sales","Tech-support")] <- "White"
train_na_as_cat$collar[trimws(train_na_as_cat$occupation) %in% c("Armed-Forces","Craft-repair","Farming-fishing","Handlers-cleaners","Machine-op-inspct","Other-service","Priv-house-serv","Transport-moving")] <- "Blue"
# Polynomial features used by the "complex" model
train_na_as_cat$age2 <- train_na_as_cat$age^2
train_na_as_cat$age3 <- train_na_as_cat$age^3
train_na_as_cat$education_num2 <- train_na_as_cat$education_num^2
train_na_as_cat$hours_per_week2 <- train_na_as_cat$hours_per_week^2
train_na_as_cat$race <- as.factor(train_na_as_cat$race)
train_na_as_cat$sex <- as.factor(train_na_as_cat$sex)
train_na_as_cat$marriage_status <- as.factor(train_na_as_cat$marriage_status)
train_na_as_cat$collar <- as.factor(train_na_as_cat$collar)
#test_na_as_cat
# Binary response; trimws() guards against leading/trailing whitespace in labels
test_na_as_cat$income.binary <- 0
test_na_as_cat$income.binary[trimws(test_na_as_cat$income) == ">50K"] <- 1
test_na_as_cat$income.binary <- as.factor(test_na_as_cat$income.binary)
#Evaluate reducing levels to Government, Private, Self-Employed, Other
test_na_as_cat$workclass <- trimws(test_na_as_cat$workclass)
test_na_as_cat$work_sector <- "Other"
test_na_as_cat$work_sector[test_na_as_cat$workclass %in% c("Federal-gov","Local-gov","State-gov")] <- "Government"
test_na_as_cat$work_sector[test_na_as_cat$workclass %in% c("Private")] <- "Private"
test_na_as_cat$work_sector[test_na_as_cat$workclass %in% c("Self-emp-inc","Self-emp-not-inc")] <- "Self_Employed"
test_na_as_cat$work_sector[test_na_as_cat$workclass %in% c("Never-worked","Without-pay")] <- "Not_Working"
test_na_as_cat$work_sector <- as.factor(test_na_as_cat$work_sector)
# Collapse marital_status into Married / Previously-Married / Single / Other
test_na_as_cat$marriage_status <- "Other"
test_na_as_cat$marriage_status[trimws(test_na_as_cat$marital_status) %in% c("Married-AF-spouse","Married-civ-spouse")] <- "Married"
test_na_as_cat$marriage_status[trimws(test_na_as_cat$marital_status) %in% c("Divorced","Married-spouse-absent","Separated","Widowed")] <- "Previously-Married"
test_na_as_cat$marriage_status[trimws(test_na_as_cat$marital_status) %in% c("Never-married")] <- "Single"
# Collapse occupation into collar groups.
# BUG FIX: the original "Tech-support\n..." / "Armed-Forces\n..." literals had an
# embedded newline and never matched, leaving those occupations as "Other".
test_na_as_cat$collar <- "Other"
test_na_as_cat$collar[trimws(test_na_as_cat$occupation) %in% c("Adm-clerical")] <- "White-support"
test_na_as_cat$collar[trimws(test_na_as_cat$occupation) %in% c("Exec-managerial","Prof-specialty","Protective-serv","Sales","Tech-support")] <- "White"
test_na_as_cat$collar[trimws(test_na_as_cat$occupation) %in% c("Armed-Forces","Craft-repair","Farming-fishing","Handlers-cleaners","Machine-op-inspct","Other-service","Priv-house-serv","Transport-moving")] <- "Blue"
# Polynomial features used by the "complex" model
test_na_as_cat$age2 <- test_na_as_cat$age^2
test_na_as_cat$age3 <- test_na_as_cat$age^3
test_na_as_cat$education_num2 <- test_na_as_cat$education_num^2
test_na_as_cat$hours_per_week2 <- test_na_as_cat$hours_per_week^2
test_na_as_cat$race <- as.factor(test_na_as_cat$race)
test_na_as_cat$sex <- as.factor(test_na_as_cat$sex)
test_na_as_cat$marriage_status <- as.factor(test_na_as_cat$marriage_status)
test_na_as_cat$collar <- as.factor(test_na_as_cat$collar)
Base Logistic regression model with all original variables for comparison
# Base LASSO-regularized logistic model with all original variables, for comparison
original.variables <- c("age","workclass","fnlwgt","education","education_num","marital_status","occupation","relationship","race","sex","capital_gain","capital_loss","hours_per_week","native_country")
# all_of() makes the external-vector selection explicit and silences the
# tidyselect ambiguity message that the bare vector used to trigger
dat.train.x <- train_na_as_cat %>% select(all_of(original.variables))
# Per-column NA counts; a purrr-style lambda replaces the deprecated funs()
dat.train.x %>% summarise_all(~ sum(is.na(.)))
## age workclass fnlwgt education education_num marital_status occupation
## 1 0 0 0 0 0 0 0
## relationship race sex capital_gain capital_loss hours_per_week native_country
## 1 0 0 0 0 0 0 0
# Integer-code factor columns so glmnet receives a fully numeric matrix
dat.train.x <- mutate_if(dat.train.x, is.factor, ~ as.numeric(as.factor(.x)))
dat.train.y <- train_na_as_cat$income
dat.train.y <- as.factor(as.character(dat.train.y))
dat.train.x %>% summarise_all(~ sum(is.na(.)))
## age workclass fnlwgt education education_num marital_status occupation
## 1 0 0 0 0 0 0 0
## relationship race sex capital_gain capital_loss hours_per_week native_country
## 1 0 0 0 0 0 0 0
#glmnet requires a matrix
dat.train.x <- data.matrix(dat.train.x)
# 10-fold CV over a dense lambda path, minimizing misclassification error
cvfit <- cv.glmnet(dat.train.x, dat.train.y, family = "binomial", type.measure = "class", nlambda = 1000)
plot(cvfit)
coef(cvfit, s = "lambda.min")
## 15 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -7.969947e+00
## age 3.150658e-02
## workclass -4.713516e-02
## fnlwgt 2.770859e-07
## education 1.008746e-02
## education_num 3.171810e-01
## marital_status -2.264256e-01
## occupation .
## relationship -1.150098e-01
## race 7.718834e-02
## sex 8.162277e-01
## capital_gain 2.590585e-04
## capital_loss 6.392205e-04
## hours_per_week 2.876879e-02
## native_country .
#Get training set predictions...We know they are biased but lets create ROC's.
#These are predicted probabilities from logistic model exp(b)/(1+exp(b))
fit.pred <- predict(cvfit, newx = dat.train.x, type = "response")
#Compare the prediction to the real outcome
head(fit.pred)
## lambda.1se
## [1,] 0.009915691
## [2,] 0.219400680
## [3,] 0.074142555
## [4,] 0.242003851
## [5,] 0.052182072
## [6,] 0.218610947
head(dat.train.y)
## [1] <=50K >50K <=50K <=50K <=50K <=50K
## Levels: <=50K >50K
#Create ROC curves (ROCR): prediction() pairs scores with truth;
#performance() computes TPR/FPR and AUC. Training AUC is optimistic.
pred <- prediction(fit.pred[,1], dat.train.y)
roc.perf = performance(pred, measure = "tpr", x.measure = "fpr")
auc.train <- performance(pred, measure = "auc")
auc.train <- auc.train@y.values
#Plot ROC
plot(roc.perf)
abline(a=0, b= 1) #Ref line indicating poor performance
text(x = .40, y = .6,paste("AUC = ", round(auc.train[[1]],3), sep = ""))
#Get Test Set
# all_of() makes the external-vector column selection explicit (tidyselect)
dat.val1.x <- test_na_as_cat %>% select(all_of(original.variables))
dat.val1.x <- mutate_if(dat.val1.x, is.factor, ~ as.numeric(as.factor(.x)))
dat.val1.x <- data.matrix(dat.val1.x)
dat.val1.y <- test_na_as_cat$income
dat.val1.y <- as.factor(as.character(dat.val1.y))
#Run model from training set on valid set I
fit.pred1 <- predict(cvfit, newx = dat.val1.x, type = "response")
#ROC curve on the held-out test set (unbiased estimate of discrimination)
pred1 <- prediction(fit.pred1[,1], dat.val1.y)
roc.perf1 <- performance(pred1, measure = "tpr", x.measure = "fpr")
auc.val1 <- performance(pred1, measure = "auc")
auc.val1 <- auc.val1@y.values
plot(roc.perf1)
abline(a = 0, b = 1)
text(x = .40, y = .6, paste("AUC = ", round(auc.val1[[1]], 3), sep = ""))
# Unpenalized logistic fit on the training data to inspect coefficient significance
significance_check <- train_na_as_cat %>% select(all_of(original.variables), "income")
significance_check$income.binary <- 0
# trimws() guards against whitespace differences in the income labels
# (earlier code for the raw adult data matched " >50K" with a leading space)
significance_check$income.binary[trimws(significance_check$income) == ">50K"] <- 1
significance_check <- significance_check %>% select(-income)
# Integer-code factor and character predictors
significance_check <- mutate_if(significance_check, is.factor, ~ as.numeric(as.factor(.x)))
significance_check <- mutate_if(significance_check, is.character, ~ as.numeric(as.factor(.x)))
model <- glm(income.binary ~ ., family = binomial(link = 'logit'), data = significance_check)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = significance_check)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.3075 -0.6183 -0.3409 -0.0904 3.2944
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.685e+00 2.664e-01 -32.603 < 2e-16 ***
## age 3.339e-02 1.509e-03 22.127 < 2e-16 ***
## workclass -6.389e-02 1.216e-02 -5.254 1.49e-07 ***
## fnlwgt 5.338e-07 1.715e-07 3.112 0.00186 **
## education 1.858e-02 5.814e-03 3.196 0.00139 **
## education_num 3.318e-01 8.086e-03 41.032 < 2e-16 ***
## marital_status -2.438e-01 1.373e-02 -17.755 < 2e-16 ***
## occupation -3.330e-04 4.079e-03 -0.082 0.93495
## relationship -1.187e-01 1.604e-02 -7.396 1.40e-13 ***
## race 1.050e-01 2.385e-02 4.402 1.07e-05 ***
## sex 8.829e-01 5.628e-02 15.688 < 2e-16 ***
## capital_gain 3.143e-04 1.079e-05 29.130 < 2e-16 ***
## capital_loss 6.932e-04 3.772e-05 18.378 < 2e-16 ***
## hours_per_week 3.086e-02 1.567e-03 19.693 < 2e-16 ***
## native_country 2.636e-03 3.008e-03 0.876 0.38099
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28706 on 26048 degrees of freedom
## Residual deviance: 19914 on 26034 degrees of freedom
## AIC: 19944
##
## Number of Fisher Scoring iterations: 7
#occupation and native_country are not significant
# NOTE(review): the comment above flags occupation/native_country as not
# significant, but the code below drops fnlwgt (which WAS significant in the
# previous fit) instead — confirm which drop was intended.
significance_check2 <- significance_check %>% select(-fnlwgt)
model2 <- glm(income.binary ~.,family=binomial(link='logit'),data=significance_check2)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model2)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = significance_check2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.2947 -0.6195 -0.3409 -0.0904 3.2838
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.539e+00 2.617e-01 -32.625 < 2e-16 ***
## age 3.308e-02 1.505e-03 21.982 < 2e-16 ***
## workclass -6.480e-02 1.215e-02 -5.332 9.73e-08 ***
## education 1.847e-02 5.814e-03 3.177 0.00149 **
## education_num 3.313e-01 8.083e-03 40.991 < 2e-16 ***
## marital_status -2.435e-01 1.373e-02 -17.738 < 2e-16 ***
## occupation -2.786e-04 4.079e-03 -0.068 0.94555
## relationship -1.187e-01 1.604e-02 -7.402 1.34e-13 ***
## race 1.036e-01 2.380e-02 4.354 1.34e-05 ***
## sex 8.872e-01 5.626e-02 15.771 < 2e-16 ***
## capital_gain 3.143e-04 1.078e-05 29.148 < 2e-16 ***
## capital_loss 6.916e-04 3.769e-05 18.349 < 2e-16 ***
## hours_per_week 3.065e-02 1.565e-03 19.593 < 2e-16 ***
## native_country 2.218e-03 3.005e-03 0.738 0.46049
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28706 on 26048 degrees of freedom
## Residual deviance: 19924 on 26035 degrees of freedom
## AIC: 19952
##
## Number of Fisher Scoring iterations: 7
Re-run Base model without CV
#Setup Training set
base.train <- train_na_as_cat %>% select(original.variables,"income")
base.train <- mutate_if(base.train, is.character, ~ (as.factor(.x)))
base.train <- mutate_if(base.train, is.factor, ~ as.numeric(as.factor(.x)))
# income is coded 1/2 after the factor->numeric conversion; shift to 0/1
base.train$income <- base.train$income-1
#Run Model
base.model1 <- glm(income ~ .,family="binomial",data=base.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#Setup Test set
base.test <- test_na_as_cat %>% select(original.variables,"income")
base.test <- mutate_if(base.test, is.character, ~ (as.factor(.x)))
base.test <- mutate_if(base.test, is.factor, ~ as.numeric(as.factor(.x)))
base.test$income <- base.test$income-1
#Make Predictions and check performance
predprobs<-predict(base.model1,base.test,type="response")
# NOTE(review): roc() here appears to be pROC::roc — confirm pROC is attached
# (it is not among the library() calls shown at the top of this file)
base.roc<-roc(response=base.test$income,predictor=predprobs,levels=c(0,1))
## Setting direction: controls < cases
plot(base.roc,print.thres="best", main="Base Logistic Regression") #This graph is nice because the x axis is plotted in terms of specificity rather than FPR
text(x = .40, y = .6,paste("AUC-test = ", round(auc(base.roc),3), sep = ""))
auc(base.roc)
## Area under the curve: 0.8538
#AUC = 0.854 on the test set (the value printed above)
# Classify at the "best" threshold from the ROC plot rather than 0.5
threshold=0.214
log.predclass<-ifelse(predprobs>threshold,1,0)
log.predclass<-factor(log.predclass)
confusionMatrix(as.factor(log.predclass),as.factor(base.test$income))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3663 297
## 1 1258 1294
##
## Accuracy : 0.7612
## 95% CI : (0.7507, 0.7715)
## No Information Rate : 0.7557
## P-Value [Acc > NIR] : 0.1529
##
## Kappa : 0.4631
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7444
## Specificity : 0.8133
## Pos Pred Value : 0.9250
## Neg Pred Value : 0.5071
## Prevalence : 0.7557
## Detection Rate : 0.5625
## Detection Prevalence : 0.6081
## Balanced Accuracy : 0.7788
##
## 'Positive' Class : 0
##
From above: drop fnlwgt (not a relevant predictor of income) and drop native_country (not significant).
From the EDA: replace workclass with work_sector, replace marital_status with marriage_status, replace occupation with collar, and drop education (redundant with education_num).
# Training predictors: engineered features (work_sector, marriage_status,
# collar) replace their raw counterparts, which are dropped below.
# all_of() makes the external-vector selection explicit (bare vectors are
# deprecated in tidyselect).
dat.train.x <- train_na_as_cat %>% select(all_of(original.variables),"work_sector","marriage_status","collar")
dat.train.x <- dat.train.x %>% select(-"workclass",-"marital_status",-"occupation",-"education",-"fnlwgt",-"native_country")
dat.train.x <- mutate_if(dat.train.x, is.factor, ~ as.numeric(as.factor(.x)))
dat.train.y <- train_na_as_cat$income
dat.train.y <- as.factor(as.character(dat.train.y))
# Sanity check: no missing values remain (funs() is deprecated since dplyr 0.8;
# use a lambda instead)
dat.train.x %>% summarise_all(~ sum(is.na(.)))
## age education_num relationship race sex capital_gain capital_loss
## 1 0 0 0 0 0 0 0
## hours_per_week work_sector marriage_status collar
## 1 0 0 0 0
#glmnet requires a matrix
dat.train.x <- data.matrix(dat.train.x)
cvfit <- cv.glmnet(dat.train.x, dat.train.y, family = "binomial", type.measure = "class", nlambda = 1000)
plot(cvfit)
coef(cvfit, s = "lambda.min")
## 12 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -6.3031233883
## age 0.0178718559
## education_num 0.3116364168
## relationship 0.0493979035
## race 0.0704642615
## sex 0.5360340797
## capital_gain 0.0002923497
## capital_loss 0.0006557260
## hours_per_week 0.0297254142
## work_sector -0.0098634403
## marriage_status -1.5230345419
## collar 0.2630667782
#Get training set predictions...We know they are biased but lets create ROC's.
#These are predicted probabilities from logistic model exp(b)/(1+exp(b))
fit.pred <- predict(cvfit, newx = dat.train.x, type = "response")
#Compare the prediction to the real outcome
head(fit.pred)
## lambda.1se
## [1,] 0.006013594
## [2,] 0.276863035
## [3,] 0.021296663
## [4,] 0.074593546
## [5,] 0.012394220
## [6,] 0.067474730
head(dat.train.y)
## [1] <=50K >50K <=50K <=50K <=50K <=50K
## Levels: <=50K >50K
#Create ROC curves
pred <- prediction(fit.pred[,1], dat.train.y)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc.train <- performance(pred, measure = "auc")
auc.train <- auc.train@y.values
#Plot ROC
plot(roc.perf)
abline(a=0, b= 1) #Ref line indicating poor performance
text(x = .40, y = .6,paste("AUC = ", round(auc.train[[1]],3), sep = ""))
#Get Test Set: apply the identical feature selection and encoding used on train
dat.val1.x <- test_na_as_cat %>% select(all_of(original.variables),"work_sector","marriage_status","collar")
dat.val1.x <- dat.val1.x %>% select(-"workclass",-"marital_status",-"occupation",-"education",-"fnlwgt",-"native_country")
dat.val1.x <- mutate_if(dat.val1.x, is.factor, ~ as.numeric(as.factor(.x)))
dat.val1.x <- data.matrix(dat.val1.x)
dat.val1.y <- test_na_as_cat$income
dat.val1.y <- as.factor(as.character(dat.val1.y))
#Run model from training set on valid set I
fit.pred1 <- predict(cvfit, newx = dat.val1.x, type = "response")
#ROC curves on the held-out set
pred1 <- prediction(fit.pred1[,1], dat.val1.y)
roc.perf1 <- performance(pred1, measure = "tpr", x.measure = "fpr")
auc.val1 <- performance(pred1, measure = "auc")
auc.val1 <- auc.val1@y.values
plot(roc.perf1)
abline(a=0, b= 1)
text(x = .40, y = .6,paste("AUC = ", round(auc.val1[[1]],3), sep = ""))
# NOTE(review): this dat.train.x is not referenced again before being rebuilt
# below (L3147 region); kept for fidelity with the original workflow.
dat.train.x <- train_na_as_cat %>% select(all_of(original.variables),"work_sector","marriage_status","collar")
dat.train.x <- dat.train.x %>% select(-"workclass",-"marital_status",-"occupation",-"education",-"fnlwgt",-"native_country")
# Refit as an ordinary glm on the same features to obtain per-predictor p-values
significance_check3 <- train_na_as_cat %>% select(all_of(original.variables),"work_sector","marriage_status","collar","income")
significance_check3 <- significance_check3 %>% select(-"workclass",-"marital_status",-"occupation",-"education",-"fnlwgt",-"native_country")
# Binary response: 1 if income >50K, else 0
significance_check3$income.binary<-0
significance_check3$income.binary[significance_check3$income==">50K"] <- 1
significance_check3 <- significance_check3 %>% select(-income)
significance_check3 <- mutate_if(significance_check3, is.factor, ~ as.numeric(as.factor(.x)))
significance_check3 <- mutate_if(significance_check3, is.character, ~ as.numeric(as.factor(.x)))
model3 <- glm(income.binary ~.,family=binomial(link='logit'),data=significance_check3)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model3)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = significance_check3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.3271 -0.5543 -0.2146 -0.0534 3.3818
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.889e+00 2.677e-01 -25.731 < 2e-16 ***
## age 1.930e-02 1.622e-03 11.902 < 2e-16 ***
## education_num 3.185e-01 9.064e-03 35.141 < 2e-16 ***
## relationship 9.135e-02 1.891e-02 4.831 1.36e-06 ***
## race 8.943e-02 2.495e-02 3.585 0.000338 ***
## sex 6.845e-01 7.723e-02 8.863 < 2e-16 ***
## capital_gain 3.226e-04 1.106e-05 29.155 < 2e-16 ***
## capital_loss 6.873e-04 4.088e-05 16.812 < 2e-16 ***
## hours_per_week 3.126e-02 1.682e-03 18.584 < 2e-16 ***
## work_sector -1.929e-02 1.596e-02 -1.209 0.226742
## marriage_status -1.573e+00 3.503e-02 -44.903 < 2e-16 ***
## collar 2.753e-01 1.998e-02 13.781 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28706 on 26048 degrees of freedom
## Residual deviance: 17337 on 26037 degrees of freedom
## AIC: 17361
##
## Number of Fisher Scoring iterations: 7
#Work_sector is not significant
From above: Drop work_sector
# Rebuild the training predictors with work_sector and relationship dropped
dat.train.x <- train_na_as_cat %>% select(all_of(original.variables),"marriage_status","collar")
dat.train.x <- dat.train.x %>% select(-"workclass",-"marital_status",-"occupation",-"education",-"fnlwgt",-"native_country",-"relationship")
#dat.train.x <- mutate_if(dat.train.x, is.factor, ~ as.numeric(as.factor(.x)))
dat.train.x <- mutate_if(dat.train.x, is.character, ~ as.factor(.x))
dat.train.y <- train_na_as_cat$income
dat.train.y <- as.factor(as.character(dat.train.y))
# Sanity check: no missing values remain (funs() is deprecated since dplyr 0.8;
# use a lambda instead)
dat.train.x %>% summarise_all(~ sum(is.na(.)))
## age education_num race sex capital_gain capital_loss hours_per_week
## 1 0 0 0 0 0 0 0
## marriage_status collar
## 1 0 0
#glmnet requires a matrix
dat.train.x <- data.matrix(dat.train.x)
cvfit <- cv.glmnet(dat.train.x, dat.train.y, family = "binomial", type.measure = "class", nlambda = 1000)
plot(cvfit)
coef(cvfit, s = "lambda.min")
## 10 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -5.8792668264
## age 0.0171369741
## education_num 0.3098782739
## race 0.0626796588
## sex 0.3861383873
## capital_gain 0.0002856130
## capital_loss 0.0006476984
## hours_per_week 0.0290806745
## marriage_status -1.5042598459
## collar 0.2623570399
#Get training set predictions...We know they are biased but lets create ROC's.
#These are predicted probabilities from logistic model exp(b)/(1+exp(b))
fit.pred <- predict(cvfit, newx = dat.train.x, type = "response")
#Compare the prediction to the real outcome
head(fit.pred)
## lambda.1se
## [1,] 0.005649833
## [2,] 0.274986143
## [3,] 0.020392296
## [4,] 0.073250700
## [5,] 0.011816526
## [6,] 0.065863652
head(dat.train.y)
## [1] <=50K >50K <=50K <=50K <=50K <=50K
## Levels: <=50K >50K
#Create ROC curves
pred <- prediction(fit.pred[,1], dat.train.y)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc.train <- performance(pred, measure = "auc")
auc.train <- auc.train@y.values
#Plot ROC
plot(roc.perf)
abline(a=0, b= 1) #Ref line indicating poor performance
text(x = .40, y = .6,paste("AUC = ", round(auc.train[[1]],3), sep = ""))
#Get Test Set: apply the identical feature selection and encoding used on train
dat.val1.x <- test_na_as_cat %>% select(all_of(original.variables),"marriage_status","collar")
dat.val1.x <- dat.val1.x %>% select(-"workclass",-"marital_status",-"occupation",-"education",-"fnlwgt",-"native_country",-"relationship")
dat.val1.x <- mutate_if(dat.val1.x, is.factor, ~ as.numeric(as.factor(.x)))
dat.val1.x <- data.matrix(dat.val1.x)
dat.val1.y <- test_na_as_cat$income
dat.val1.y <- as.factor(as.character(dat.val1.y))
#Run model from training set on valid set I
fit.pred1 <- predict(cvfit, newx = dat.val1.x, type = "response")
#ROC curves on the held-out set
pred1 <- prediction(fit.pred1[,1], dat.val1.y)
roc.perf1 <- performance(pred1, measure = "tpr", x.measure = "fpr")
auc.val1 <- performance(pred1, measure = "auc")
auc.val1 <- auc.val1@y.values
plot(roc.perf1)
abline(a=0, b= 1)
text(x = .40, y = .6,paste("AUC = ", round(auc.val1[[1]],3), sep = ""))
# NOTE(review): this dat.train.x is not referenced again in this chunk; kept
# for fidelity with the original workflow.
dat.train.x <- train_na_as_cat %>% select(all_of(original.variables),"marriage_status","collar")
dat.train.x <- dat.train.x %>% select(-"workclass",-"marital_status",-"occupation",-"education",-"fnlwgt",-"native_country")
# Refit as an ordinary glm (factors kept as factors this time) for p-values
significance_check4 <- train_na_as_cat %>% select(all_of(original.variables),"marriage_status","collar","income")
significance_check4 <- significance_check4 %>% select(-"workclass",-"marital_status",-"occupation",-"education",-"fnlwgt",-"native_country",-"relationship")
# Binary response: 1 if income >50K, else 0
significance_check4$income.binary<-0
significance_check4$income.binary[significance_check4$income==">50K"] <- 1
significance_check4 <- significance_check4 %>% select(-income)
#significance_check4 <- mutate_if(significance_check4, is.factor, ~ as.numeric(as.factor(.x)))
significance_check4 <- mutate_if(significance_check4, is.character, ~ as.factor(.x))
model3 <- glm(income.binary ~.,family=binomial(link='logit'),data=significance_check4)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model3)
##
## Call:
## glm(formula = income.binary ~ ., family = binomial(link = "logit"),
## data = significance_check4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.2827 -0.5271 -0.2207 -0.0665 3.2753
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.889e+00 2.822e-01 -24.413 < 2e-16 ***
## age 2.195e-02 1.670e-03 13.147 < 2e-16 ***
## education_num 2.905e-01 9.407e-03 30.883 < 2e-16 ***
## raceAsian-Pac-Islander 3.692e-01 2.653e-01 1.392 0.163902
## raceBlack 4.732e-01 2.541e-01 1.862 0.062573 .
## raceOther 2.204e-02 3.728e-01 0.059 0.952853
## raceWhite 5.464e-01 2.434e-01 2.245 0.024753 *
## sexMale 1.722e-01 5.563e-02 3.095 0.001965 **
## capital_gain 3.188e-04 1.110e-05 28.724 < 2e-16 ***
## capital_loss 6.841e-04 4.124e-05 16.590 < 2e-16 ***
## hours_per_week 3.006e-02 1.698e-03 17.701 < 2e-16 ***
## marriage_statusPreviously-Married -2.139e+00 6.443e-02 -33.205 < 2e-16 ***
## marriage_statusSingle -2.758e+00 7.016e-02 -39.316 < 2e-16 ***
## collarOther 2.716e-01 8.080e-02 3.362 0.000775 ***
## collarWhite 8.125e-01 4.796e-02 16.941 < 2e-16 ***
## collarWhite-support 4.125e-01 7.652e-02 5.390 7.03e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28706 on 26048 degrees of freedom
## Residual deviance: 17121 on 26033 degrees of freedom
## AIC: 17153
##
## Number of Fisher Scoring iterations: 7
Re-Run Simple Logistic regression on full train set instead of using CV
#Setup Training set: simple feature set, factors kept as factors
simple1.train <- train_na_as_cat %>% select(all_of(original.variables),"marriage_status","collar","income")
simple1.train <- simple1.train %>% select(-"workclass",-"marital_status",-"occupation",-"education",-"fnlwgt",-"native_country",-"relationship")
simple1.train$income <- as.factor(simple1.train$income)
#Run Model: plain logistic regression on the full training set (no CV)
simple.model1 <- glm(income ~ .,family="binomial",data=simple1.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#Setup Test set with the same feature selection
simple1.test <- test_na_as_cat %>% select(all_of(original.variables),"marriage_status","collar","income")
simple1.test <- simple1.test %>% select(-"workclass",-"marital_status",-"occupation",-"education",-"fnlwgt",-"native_country",-"relationship")
simple1.test$income <- as.factor(simple1.test$income)
#Make Predictions and check performance
predprobs<-predict(simple.model1,simple1.test,type="response")
simple.roc<-roc(response=simple1.test$income,predictor=predprobs,levels=c("<=50K",">50K"))
## Setting direction: controls < cases
plot(simple.roc,print.thres="best", main="Simple Logistic Regression") #This graph is nice because the x axis is plotted in terms of specificity rather than FPR
text(x = .40, y = .6,paste("AUC-test = ", round(auc(simple.roc),3), sep = ""))
auc(simple.roc)
## Area under the curve: 0.8958
#AUC=0.896
# Threshold taken from the "best" point on the ROC curve plotted above
threshold <- 0.218
log.predclass<-ifelse(predprobs>threshold,">50K","<=50K")
log.predclass<-factor(log.predclass)
confusionMatrix(log.predclass,simple1.test$income)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 3840 264
## >50K 1081 1327
##
## Accuracy : 0.7935
## 95% CI : (0.7834, 0.8032)
## No Information Rate : 0.7557
## P-Value [Acc > NIR] : 2.764e-13
##
## Kappa : 0.5234
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7803
## Specificity : 0.8341
## Pos Pred Value : 0.9357
## Neg Pred Value : 0.5511
## Prevalence : 0.7557
## Detection Rate : 0.5897
## Detection Prevalence : 0.6302
## Balanced Accuracy : 0.8072
##
## 'Positive' Class : <=50K
##
Run Complex Logistic regression with NA removed, add AUC metric
# Complex feature set on the NA-removed training data.
# all_of() silences the tidyselect ambiguous-external-vector deprecation that
# the bare vector triggered here.
dat.train.x <- train_na_rm %>% select(all_of(features.complex))
dat.train.x <- mutate_if(dat.train.x, is.factor, ~ as.numeric(as.factor(.x)))
dat.train.y <- train_na_rm$income
dat.train.y <- as.factor(as.character(dat.train.y))
#glmnet requires a matrix
dat.train.x <- as.matrix(dat.train.x)
cvfit <- cv.glmnet(dat.train.x, dat.train.y, family = "binomial", type.measure = "class", nlambda = 1000)
plot(cvfit)
coef(cvfit, s = "lambda.min")
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -8.164080e+00
## age 9.022949e-02
## education_num 1.892264e-01
## race 8.003025e-02
## sex 4.435066e-01
## capital_gain 2.937493e-04
## capital_loss 6.288669e-04
## hours_per_week 5.684486e-02
## marriage_status -1.429570e+00
## collar 2.704879e-01
## age2 .
## age3 -1.069414e-05
## education_num2 5.776112e-03
## hours_per_week2 -3.479606e-04
#Get training set predictions...We know they are biased but lets create ROC's.
#These are predicted probabilities from logistic model exp(b)/(1+exp(b))
fit.pred <- predict(cvfit, newx = dat.train.x, type = "response")
#Compare the prediction to the real outcome
head(fit.pred)
## lambda.1se
## [1,] 0.343368090
## [2,] 0.106388067
## [3,] 0.006189991
## [4,] 0.021903661
## [5,] 0.437790104
## [6,] 0.011292144
head(dat.train.y)
## [1] <=50K <=50K <=50K <=50K >50K <=50K
## Levels: <=50K >50K
#Create ROC curves
pred <- prediction(fit.pred[,1], dat.train.y)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc.train <- performance(pred, measure = "auc")
auc.train <- auc.train@y.values
#Plot ROC
plot(roc.perf)
abline(a=0, b= 1) #Ref line indicating poor performance
text(x = .40, y = .6,paste("AUC = ", round(auc.train[[1]],3), sep = ""))
#Get Test Set with the identical feature selection and encoding
dat.val1.x <- test_na_rm %>% select(all_of(features.complex))
dat.val1.x <- mutate_if(dat.val1.x, is.factor, ~ as.numeric(as.factor(.x)))
dat.val1.x <- as.matrix(dat.val1.x)
dat.val1.y <- test_na_rm$income
dat.val1.y <- as.factor(as.character(dat.val1.y))
#Run model from training set on valid set I
fit.pred1 <- predict(cvfit, newx = dat.val1.x, type = "response")
#ROC curves
pred1 <- prediction(fit.pred1[,1], dat.val1.y)
roc.perf1 <- performance(pred1, measure = "tpr", x.measure = "fpr")
auc.val1 <- performance(pred1, measure = "auc")
auc.val1 <- auc.val1@y.values
plot(roc.perf1)
abline(a=0, b= 1)
text(x = .40, y = .6,paste("AUC = ", round(auc.val1[[1]],3), sep = ""))
Run Simple Logistic regression with NA as category, add AUC metric
# Simple feature set with NA treated as its own category.
# all_of() silences the tidyselect ambiguous-external-vector deprecation that
# the bare vector triggered here.
dat.train.x <- train_na_as_cat %>% select(all_of(features.simple))
dat.train.x <- mutate_if(dat.train.x, is.factor, ~ as.numeric(as.factor(.x)))
dat.train.y <- train_na_as_cat$income
dat.train.y <- as.factor(as.character(dat.train.y))
#glmnet requires a matrix
dat.train.x <- as.matrix(dat.train.x)
cvfit <- cv.glmnet(dat.train.x, dat.train.y, family = "binomial", type.measure = "class", nlambda = 1000)
plot(cvfit)
coef(cvfit, s = "lambda.min")
## 10 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -5.6249810542
## age 0.0162249860
## education_num 0.3041975904
## race 0.0486353885
## sex 0.3651504431
## capital_gain 0.0002604788
## capital_loss 0.0006200047
## hours_per_week 0.0279950699
## marriage_status -1.4710727460
## collar 0.2490688174
#Get training set predictions...We know they are biased but lets create ROC's.
#These are predicted probabilities from logistic model exp(b)/(1+exp(b))
fit.pred <- predict(cvfit, newx = dat.train.x, type = "response")
#Compare the prediction to the real outcome
head(fit.pred)
## lambda.1se
## [1,] 0.004670232
## [2,] 0.269717137
## [3,] 0.016935185
## [4,] 0.069326265
## [5,] 0.010199770
## [6,] 0.061186994
head(dat.train.y)
## [1] <=50K >50K <=50K <=50K <=50K <=50K
## Levels: <=50K >50K
#Create ROC curves
pred <- prediction(fit.pred[,1], dat.train.y)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc.train <- performance(pred, measure = "auc")
auc.train <- auc.train@y.values
#Plot ROC
plot(roc.perf)
abline(a=0, b= 1) #Ref line indicating poor performance
text(x = .40, y = .6,paste("AUC = ", round(auc.train[[1]],3), sep = ""))
#Get Test Set with the identical feature selection and encoding
dat.val1.x <- test_na_as_cat %>% select(all_of(features.simple))
dat.val1.x <- mutate_if(dat.val1.x, is.factor, ~ as.numeric(as.factor(.x)))
dat.val1.x <- as.matrix(dat.val1.x)
dat.val1.y <- test_na_as_cat$income
dat.val1.y <- as.factor(as.character(dat.val1.y))
#Run model from training set on valid set I
fit.pred1 <- predict(cvfit, newx = dat.val1.x, type = "response")
#ROC curves
pred1 <- prediction(fit.pred1[,1], dat.val1.y)
roc.perf1 <- performance(pred1, measure = "tpr", x.measure = "fpr")
auc.val1 <- performance(pred1, measure = "auc")
auc.val1 <- auc.val1@y.values
plot(roc.perf1)
abline(a=0, b= 1)
text(x = .40, y = .6,paste("AUC = ", round(auc.val1[[1]],3), sep = ""))
Run Complex Logistic regression with NA as category, add AUC metric
# Complex feature set (minus age2) on the NA-removed training data
dat.train.x <- train_na_rm %>% select(all_of(features.complex))
dat.train.x <- dat.train.x %>% select(-age2)
#dat.train.x$sex_marriage <- paste(dat.train.x$sex, "_", dat.train.x$marriage_status)
#dat.train.x$sex_marriage <- as.factor(dat.train.x$sex_marriage)
#dat.train.x$relationship <- as.factor(dat.train.x$relationship)
dat.train.x <- mutate_if(dat.train.x, is.factor, ~ as.numeric(as.factor(.x)))
dat.train.y <- train_na_rm$income
dat.train.y <- as.factor(as.character(dat.train.y))
#glmnet requires a matrix
dat.train.x <- as.matrix(dat.train.x)
set.seed(284)
# NOTE(review): foldids is only consumed by the commented-out cv.glmnet call
# below; the active call lets cv.glmnet pick its own folds.
flds <- createFolds(dat.train.y, k = 5, list = TRUE, returnTrain = FALSE)
foldids <- rep(1,length(dat.train.y))
foldids[flds$Fold2] <- 2
foldids[flds$Fold3] <- 3
foldids[flds$Fold4] <- 4
foldids[flds$Fold5] <- 5
cvfit <- cv.glmnet(dat.train.x, dat.train.y, family = "binomial", type.measure = "class", nlambda = 1000)
#cvfit <- cv.glmnet(dat.train.x, dat.train.y, family = "binomial", type.measure = "class", nlambda = 1000,foldid = foldids)
plot(cvfit)
coef(cvfit, s = "lambda.min")
## 13 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -8.253238e+00
## age 9.155500e-02
## education_num 1.898890e-01
## race 8.101149e-02
## sex 4.445380e-01
## capital_gain 2.954228e-04
## capital_loss 6.306582e-04
## hours_per_week 5.846497e-02
## marriage_status -1.431178e+00
## collar 2.712842e-01
## age3 -1.088501e-05
## education_num2 5.761275e-03
## hours_per_week2 -3.641796e-04
#Get training set predictions...We know they are biased but lets create ROC's.
#These are predicted probabilities from logistic model exp(b)/(1+exp(b))
fit.pred <- predict(cvfit, newx = dat.train.x, type = "response")
#Compare the prediction to the real outcome
head(fit.pred)
## lambda.1se
## [1,] 0.361106566
## [2,] 0.097771655
## [3,] 0.004554286
## [4,] 0.020135860
## [5,] 0.449555445
## [6,] 0.009541062
head(dat.train.y)
## [1] <=50K <=50K <=50K <=50K >50K <=50K
## Levels: <=50K >50K
#Create ROC curves
pred <- prediction(fit.pred[,1], dat.train.y)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc.train <- performance(pred, measure = "auc")
auc.train <- auc.train@y.values
#Plot ROC
plot(roc.perf,print.thres="best")
abline(a=0, b= 1) #Ref line indicating poor performance
text(x = .40, y = .6,paste("AUC = ", round(auc.train[[1]],3), sep = ""))
#Get Test Set with the identical feature selection and encoding
dat.val1.x <- test_na_rm %>% select(all_of(features.complex))
dat.val1.x <- dat.val1.x %>% select(-age2)
#dat.val1.x$sex_marriage <- paste(dat.val1.x$sex, "_", dat.val1.x$marriage_status)
#dat.val1.x$sex_marriage <- as.factor(dat.val1.x$sex_marriage)
#dat.val1.x$relationship <- as.factor(dat.val1.x$relationship)
dat.val1.x <- mutate_if(dat.val1.x, is.factor, ~ as.numeric(as.factor(.x)))
dat.val1.x <- as.matrix(dat.val1.x)
dat.val1.y <- test_na_rm$income
dat.val1.y <- as.factor(as.character(dat.val1.y))
#Run model from training set on valid set I
fit.pred1 <- predict(cvfit, newx = dat.val1.x, type = "response")
#ROC curves
pred1 <- prediction(fit.pred1[,1], dat.val1.y)
roc.perf1 <- performance(pred1, measure = "tpr", x.measure = "fpr")
auc.val1 <- performance(pred1, measure = "auc")
auc.val1 <- auc.val1@y.values
plot(roc.perf1,print.thres="best")
abline(a=0, b= 1)
text(x = .40, y = .6,paste("AUC = ", round(auc.val1[[1]],3), sep = ""))
# Pass a numeric vector (not the one-column matrix predict() returns) to
# roc(); matrix predictors are deprecated in pROC and warned here.
test.roc <- roc(predictor = fit.pred1[,1], response = dat.val1.y, levels = (levels(dat.val1.y)))
## Setting direction: controls < cases
plot(test.roc,print.thres="best",ylim=c(0,1))
text(x = .40, y = .6,paste("AUC-test = ", round(auc.val1[[1]],3), sep = ""))
# Default 0.5 cutoff (not the ROC-optimal threshold)
threshold <- 0.5
log.predclass<-ifelse(fit.pred1[,1]>threshold,">50K","<=50K")
log.predclass<-factor(log.predclass)
confusionMatrix(log.predclass,dat.val1.y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 4250 672
## >50K 275 835
##
## Accuracy : 0.843
## 95% CI : (0.8336, 0.8521)
## No Information Rate : 0.7502
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5408
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9392
## Specificity : 0.5541
## Pos Pred Value : 0.8635
## Neg Pred Value : 0.7523
## Prevalence : 0.7502
## Detection Rate : 0.7046
## Detection Prevalence : 0.8160
## Balanced Accuracy : 0.7467
##
## 'Positive' Class : <=50K
##
Re-run Complex model without CV
#Setup Training set: complex feature set, income as factor response
complex.train <- train_na_as_cat %>% select(all_of(features.complex),"income")
complex.train$income <- as.factor(complex.train$income)
#Run Model: plain logistic regression on the full training set (no CV)
complex.model1 <- glm(income ~ .,family="binomial",data=complex.train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#Setup Test set with the same feature selection
complex.test <- test_na_as_cat %>% select(all_of(features.complex),"income")
complex.test$income <- as.factor(complex.test$income)
#Make Predictions and check performance
predprobs<-predict(complex.model1,complex.test,type="response")
complex.roc<-roc(response=complex.test$income,predictor=predprobs,levels=c("<=50K",">50K"))
## Setting direction: controls < cases
plot(complex.roc,print.thres="best", main="Complex Logistic Regression") #This graph is nice because the x axis is plotted in terms of specificity rather than FPR
text(x = .40, y = .6,paste("AUC-test = ", round(auc(complex.roc),3), sep = ""))
auc(complex.roc)
## Area under the curve: 0.9016
#AUC=0.9016
# Threshold taken from the "best" point on the ROC curve plotted above
threshold <- 0.211
log.predclass<-ifelse(predprobs>threshold,">50K","<=50K")
log.predclass<-factor(log.predclass)
confusionMatrix(log.predclass,complex.test$income)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 3792 215
## >50K 1129 1376
##
## Accuracy : 0.7936
## 95% CI : (0.7836, 0.8034)
## No Information Rate : 0.7557
## P-Value [Acc > NIR] : 2.217e-13
##
## Kappa : 0.532
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7706
## Specificity : 0.8649
## Pos Pred Value : 0.9463
## Neg Pred Value : 0.5493
## Prevalence : 0.7557
## Detection Rate : 0.5823
## Detection Prevalence : 0.6153
## Balanced Accuracy : 0.8177
##
## 'Positive' Class : <=50K
##
Re-run Complex feature logistic regression including all possible interaction terms to look for significant ones
# Screen for significant two-way interactions among the complex features
interaction_check_set <- train_na_rm %>% select(income, all_of(features.complex))
# Binary response: 1 if income >50K, else 0
interaction_check_set$income.binary <- 0
#interaction_check_set$income.binary[interaction_check_set$income==" >50K"] <- 1
interaction_check_set$income.binary[interaction_check_set$income==">50K"] <- 1
interaction_check_set$income.binary <- as.factor(interaction_check_set$income.binary)
interaction_check_set <- interaction_check_set %>% select(-income)
# The .*. formula expands to all main effects plus every two-way interaction
model <- glm(income.binary ~.*.,family=binomial(link='logit'),data=interaction_check_set)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model)
##
## Call:
## glm(formula = income.binary ~ . * ., family = binomial(link = "logit"),
## data = interaction_check_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.2315 -0.5073 -0.1462 -0.0002 3.7494
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error
## (Intercept) -3.542e+01 2.137e+01
## age 1.730e+00 1.439e+00
## education_num 2.116e+00 2.687e+00
## raceAsian-Pac-Islander 2.729e+00 1.425e+01
## raceBlack -1.370e+01 1.423e+01
## raceOther -2.681e+01 2.540e+01
## raceWhite -6.305e+00 1.371e+01
## sexMale -2.086e+00 2.567e+00
## capital_gain -2.362e-04 3.470e-04
## capital_loss 1.772e-03 2.017e-03
## hours_per_week 5.774e-01 3.886e-01
## marriage_statusPreviously-Married 2.030e+00 3.276e+00
## marriage_statusSingle -3.635e+00 3.026e+00
## collarOther -1.694e+01 3.723e+02
## collarWhite -6.304e+00 2.592e+00
## collarWhite-support -6.730e+00 4.109e+00
## age2 -6.265e-02 4.069e-02
## age3 1.208e-03 6.505e-04
## education_num2 -8.631e-02 1.279e-01
## hours_per_week2 -4.390e-03 3.866e-03
## age:education_num -2.556e-02 1.550e-01
## age:raceAsian-Pac-Islander 1.256e-01 9.173e-01
## age:raceBlack 8.856e-01 9.098e-01
## age:raceOther 1.496e+00 1.573e+00
## age:raceWhite 5.032e-01 8.841e-01
## age:sexMale -3.365e-02 1.428e-01
## age:capital_gain 3.037e-05 2.253e-05
## age:capital_loss 4.345e-05 1.005e-04
## age:hours_per_week -1.714e-02 1.677e-02
## age:marriage_statusPreviously-Married -3.154e-01 1.872e-01
## age:marriage_statusSingle 5.893e-02 1.631e-01
## age:collarOther 3.409e-01 4.211e-01
## age:collarWhite 6.000e-01 1.572e-01
## age:collarWhite-support 4.761e-01 2.321e-01
## age:age2 NA NA
## age:age3 -1.146e-05 6.241e-06
## age:education_num2 1.156e-03 7.289e-03
## age:hours_per_week2 8.591e-05 1.632e-04
## education_num:raceAsian-Pac-Islander -1.103e-01 7.501e-01
## education_num:raceBlack -4.505e-01 7.490e-01
## education_num:raceOther 3.410e-02 9.175e-01
## education_num:raceWhite 2.105e-02 7.152e-01
## education_num:sexMale -5.236e-03 2.083e-01
## education_num:capital_gain 3.766e-05 1.519e-05
## education_num:capital_loss 3.858e-05 1.254e-04
## education_num:hours_per_week -2.638e-02 2.438e-02
## education_num:marriage_statusPreviously-Married -1.904e-01 2.059e-01
## education_num:marriage_statusSingle -8.460e-01 2.287e-01
## education_num:collarOther -3.224e-01 5.838e-01
## education_num:collarWhite -3.410e-01 1.647e-01
## education_num:collarWhite-support 1.351e-01 3.630e-01
## education_num:age2 3.191e-04 3.104e-03
## education_num:age3 -1.898e-06 2.002e-05
## education_num:education_num2 7.734e-04 8.714e-04
## education_num:hours_per_week2 2.369e-04 2.206e-04
## raceAsian-Pac-Islander:sexMale -1.103e-01 7.242e-01
## raceBlack:sexMale 5.156e-01 7.007e-01
## raceOther:sexMale 2.016e+00 2.387e+00
## raceWhite:sexMale 3.043e-01 6.639e-01
## raceAsian-Pac-Islander:capital_gain 9.217e-06 1.242e-04
## raceBlack:capital_gain 1.597e-04 1.192e-04
## raceOther:capital_gain 4.482e-04 4.126e-04
## raceWhite:capital_gain 1.624e-05 1.068e-04
## raceAsian-Pac-Islander:capital_loss -1.339e-03 1.130e-03
## raceBlack:capital_loss -1.147e-03 1.128e-03
## raceOther:capital_loss -2.163e-03 1.486e-03
## raceWhite:capital_loss -1.444e-03 1.107e-03
## raceAsian-Pac-Islander:hours_per_week -9.532e-02 2.518e-01
## raceBlack:hours_per_week 3.691e-02 2.542e-01
## raceOther:hours_per_week 9.823e-02 4.899e-01
## raceWhite:hours_per_week -6.502e-02 2.490e-01
## raceAsian-Pac-Islander:marriage_statusPreviously-Married -4.040e-01 7.923e-01
## raceBlack:marriage_statusPreviously-Married -1.613e+00 7.468e-01
## raceOther:marriage_statusPreviously-Married -1.148e+01 1.122e+02
## raceWhite:marriage_statusPreviously-Married -1.016e+00 6.973e-01
## raceAsian-Pac-Islander:marriage_statusSingle -3.112e-01 9.875e-01
## raceBlack:marriage_statusSingle -1.292e+00 9.702e-01
## raceOther:marriage_statusSingle -3.809e-01 1.660e+00
## raceWhite:marriage_statusSingle -5.194e-01 9.146e-01
## raceAsian-Pac-Islander:collarOther 1.447e+01 3.722e+02
## raceBlack:collarOther 1.462e+01 3.722e+02
## raceOther:collarOther 6.395e+00 5.687e+02
## raceWhite:collarOther 1.479e+01 3.722e+02
## raceAsian-Pac-Islander:collarWhite -1.611e-01 6.990e-01
## raceBlack:collarWhite 2.501e-01 6.827e-01
## raceOther:collarWhite -7.061e-02 1.374e+00
## raceWhite:collarWhite 4.974e-02 6.389e-01
## raceAsian-Pac-Islander:collarWhite-support -8.643e-01 9.931e-01
## raceBlack:collarWhite-support 1.294e-02 9.561e-01
## raceOther:collarWhite-support 1.008e+00 2.387e+00
## raceWhite:collarWhite-support -3.229e-01 9.103e-01
## raceAsian-Pac-Islander:age2 -2.474e-03 2.262e-02
## raceBlack:age2 -1.723e-02 2.244e-02
## raceOther:age2 -2.937e-02 3.570e-02
## raceWhite:age2 -9.383e-03 2.201e-02
## raceAsian-Pac-Islander:age3 1.793e-05 1.801e-04
## raceBlack:age3 1.079e-04 1.789e-04
## raceOther:age3 1.888e-04 2.627e-04
## raceWhite:age3 5.711e-05 1.767e-04
## raceAsian-Pac-Islander:education_num2 -3.354e-03 3.497e-02
## raceBlack:education_num2 2.150e-02 3.509e-02
## raceOther:education_num2 -1.121e-02 4.643e-02
## raceWhite:education_num2 -4.808e-03 3.332e-02
## raceAsian-Pac-Islander:hours_per_week2 9.559e-04 2.657e-03
## raceBlack:hours_per_week2 1.412e-04 2.670e-03
## raceOther:hours_per_week2 -1.201e-03 5.416e-03
## raceWhite:hours_per_week2 8.020e-04 2.631e-03
## sexMale:capital_gain -1.161e-05 3.214e-05
## sexMale:capital_loss 9.250e-05 1.359e-04
## sexMale:hours_per_week 7.233e-02 2.228e-02
## sexMale:marriage_statusPreviously-Married 1.222e+00 1.639e-01
## sexMale:marriage_statusSingle 1.096e+00 1.904e-01
## sexMale:collarOther 6.230e-02 3.578e-01
## sexMale:collarWhite -3.534e-01 1.878e-01
## sexMale:collarWhite-support -4.455e-01 2.221e-01
## sexMale:age2 1.024e-03 3.043e-03
## sexMale:age3 -6.110e-06 2.080e-05
## sexMale:education_num2 -1.364e-03 9.398e-03
## sexMale:hours_per_week2 -5.584e-04 2.303e-04
## capital_gain:capital_loss NA NA
## capital_gain:hours_per_week -9.564e-07 3.042e-06
## capital_gain:marriage_statusPreviously-Married 4.675e-05 3.181e-05
## capital_gain:marriage_statusSingle 1.291e-04 3.772e-05
## capital_gain:collarOther -5.822e-05 7.383e-05
## capital_gain:collarWhite -5.036e-05 2.830e-05
## capital_gain:collarWhite-support -1.198e-05 4.604e-05
## capital_gain:age2 -6.481e-07 4.642e-07
## capital_gain:age3 4.104e-09 3.057e-09
## capital_gain:education_num2 -1.927e-06 7.816e-07
## capital_gain:hours_per_week2 -4.345e-09 3.050e-08
## capital_loss:hours_per_week -3.130e-05 1.542e-05
## capital_loss:marriage_statusPreviously-Married -1.859e-04 1.371e-04
## capital_loss:marriage_statusSingle -3.071e-06 1.465e-04
## capital_loss:collarOther 2.284e-04 2.752e-04
## capital_loss:collarWhite 7.764e-05 1.117e-04
## capital_loss:collarWhite-support -2.414e-04 1.899e-04
## capital_loss:age2 -8.554e-07 2.064e-06
## capital_loss:age3 4.960e-09 1.352e-08
## capital_loss:education_num2 -9.533e-07 5.754e-06
## capital_loss:hours_per_week2 3.097e-07 1.510e-07
## hours_per_week:marriage_statusPreviously-Married 5.416e-02 2.785e-02
## hours_per_week:marriage_statusSingle 1.208e-01 4.338e-02
## hours_per_week:collarOther -2.686e-02 4.549e-02
## hours_per_week:collarWhite 1.280e-03 2.050e-02
## hours_per_week:collarWhite-support 1.626e-03 4.011e-02
## hours_per_week:age2 3.620e-04 3.267e-04
## hours_per_week:age3 -2.661e-06 2.034e-06
## hours_per_week:education_num2 1.117e-03 1.089e-03
## hours_per_week:hours_per_week2 5.909e-06 3.212e-06
## marriage_statusPreviously-Married:collarOther 5.409e-03 3.816e-01
## marriage_statusSingle:collarOther -3.751e-01 4.643e-01
## marriage_statusPreviously-Married:collarWhite 8.901e-02 1.790e-01
## marriage_statusSingle:collarWhite 1.820e-01 2.397e-01
## marriage_statusPreviously-Married:collarWhite-support -7.760e-01 2.894e-01
## marriage_statusSingle:collarWhite-support -2.920e-01 3.371e-01
## marriage_statusPreviously-Married:age2 5.864e-03 3.845e-03
## marriage_statusSingle:age2 -4.688e-04 3.508e-03
## marriage_statusPreviously-Married:age3 -2.976e-05 2.546e-05
## marriage_statusSingle:age3 9.454e-07 2.387e-05
## marriage_statusPreviously-Married:education_num2 8.509e-03 9.368e-03
## marriage_statusSingle:education_num2 3.899e-02 1.020e-02
## marriage_statusPreviously-Married:hours_per_week2 -2.325e-04 2.637e-04
## marriage_statusSingle:hours_per_week2 -9.000e-04 4.067e-04
## collarOther:age2 -7.301e-03 9.709e-03
## collarWhite:age2 -1.306e-02 3.468e-03
## collarWhite-support:age2 -1.027e-02 5.120e-03
## collarOther:age3 5.271e-05 7.210e-05
## collarWhite:age3 9.165e-05 2.473e-05
## collarWhite-support:age3 7.128e-05 3.648e-05
## collarOther:education_num2 1.585e-02 2.669e-02
## collarWhite:education_num2 1.900e-02 8.014e-03
## collarWhite-support:education_num2 -6.307e-03 1.694e-02
## collarOther:hours_per_week2 3.363e-04 4.454e-04
## collarWhite:hours_per_week2 -1.038e-05 1.922e-04
## collarWhite-support:hours_per_week2 1.223e-04 4.545e-04
## age2:age3 4.134e-08 2.392e-08
## age2:education_num2 -1.673e-05 1.464e-04
## age2:hours_per_week2 -2.254e-06 3.216e-06
## age3:education_num2 9.638e-08 9.469e-07
## age3:hours_per_week2 1.936e-08 2.033e-08
## education_num2:hours_per_week2 -1.124e-05 9.885e-06
## z value Pr(>|z|)
## (Intercept) -1.658 0.097407 .
## age 1.203 0.229073
## education_num 0.787 0.431029
## raceAsian-Pac-Islander 0.192 0.848091
## raceBlack -0.963 0.335679
## raceOther -1.056 0.291092
## raceWhite -0.460 0.645559
## sexMale -0.813 0.416428
## capital_gain -0.681 0.496007
## capital_loss 0.878 0.379705
## hours_per_week 1.486 0.137377
## marriage_statusPreviously-Married 0.620 0.535414
## marriage_statusSingle -1.201 0.229559
## collarOther -0.045 0.963713
## collarWhite -2.432 0.015018 *
## collarWhite-support -1.638 0.101441
## age2 -1.540 0.123613
## age3 1.856 0.063393 .
## education_num2 -0.675 0.499635
## hours_per_week2 -1.135 0.256166
## age:education_num -0.165 0.868972
## age:raceAsian-Pac-Islander 0.137 0.891076
## age:raceBlack 0.973 0.330365
## age:raceOther 0.951 0.341735
## age:raceWhite 0.569 0.569255
## age:sexMale -0.236 0.813697
## age:capital_gain 1.348 0.177527
## age:capital_loss 0.433 0.665372
## age:hours_per_week -1.022 0.306898
## age:marriage_statusPreviously-Married -1.684 0.092103 .
## age:marriage_statusSingle 0.361 0.717847
## age:collarOther 0.810 0.418151
## age:collarWhite 3.817 0.000135 ***
## age:collarWhite-support 2.051 0.040271 *
## age:age2 NA NA
## age:age3 -1.835 0.066438 .
## age:education_num2 0.159 0.873930
## age:hours_per_week2 0.526 0.598549
## education_num:raceAsian-Pac-Islander -0.147 0.883051
## education_num:raceBlack -0.601 0.547529
## education_num:raceOther 0.037 0.970348
## education_num:raceWhite 0.029 0.976520
## education_num:sexMale -0.025 0.979942
## education_num:capital_gain 2.479 0.013179 *
## education_num:capital_loss 0.308 0.758345
## education_num:hours_per_week -1.082 0.279150
## education_num:marriage_statusPreviously-Married -0.925 0.355206
## education_num:marriage_statusSingle -3.700 0.000216 ***
## education_num:collarOther -0.552 0.580793
## education_num:collarWhite -2.071 0.038398 *
## education_num:collarWhite-support 0.372 0.709755
## education_num:age2 0.103 0.918132
## education_num:age3 -0.095 0.924447
## education_num:education_num2 0.888 0.374786
## education_num:hours_per_week2 1.074 0.282805
## raceAsian-Pac-Islander:sexMale -0.152 0.878974
## raceBlack:sexMale 0.736 0.461850
## raceOther:sexMale 0.845 0.398320
## raceWhite:sexMale 0.458 0.646767
## raceAsian-Pac-Islander:capital_gain 0.074 0.940858
## raceBlack:capital_gain 1.339 0.180448
## raceOther:capital_gain 1.086 0.277328
## raceWhite:capital_gain 0.152 0.879159
## raceAsian-Pac-Islander:capital_loss -1.185 0.236047
## raceBlack:capital_loss -1.017 0.309140
## raceOther:capital_loss -1.456 0.145509
## raceWhite:capital_loss -1.304 0.192204
## raceAsian-Pac-Islander:hours_per_week -0.379 0.705026
## raceBlack:hours_per_week 0.145 0.884519
## raceOther:hours_per_week 0.200 0.841096
## raceWhite:hours_per_week -0.261 0.793950
## raceAsian-Pac-Islander:marriage_statusPreviously-Married -0.510 0.610149
## raceBlack:marriage_statusPreviously-Married -2.159 0.030817 *
## raceOther:marriage_statusPreviously-Married -0.102 0.918525
## raceWhite:marriage_statusPreviously-Married -1.457 0.144997
## raceAsian-Pac-Islander:marriage_statusSingle -0.315 0.752674
## raceBlack:marriage_statusSingle -1.331 0.183126
## raceOther:marriage_statusSingle -0.229 0.818540
## raceWhite:marriage_statusSingle -0.568 0.570096
## raceAsian-Pac-Islander:collarOther 0.039 0.968985
## raceBlack:collarOther 0.039 0.968671
## raceOther:collarOther 0.011 0.991028
## raceWhite:collarOther 0.040 0.968304
## raceAsian-Pac-Islander:collarWhite -0.231 0.817686
## raceBlack:collarWhite 0.366 0.714128
## raceOther:collarWhite -0.051 0.959010
## raceWhite:collarWhite 0.078 0.937948
## raceAsian-Pac-Islander:collarWhite-support -0.870 0.384126
## raceBlack:collarWhite-support 0.014 0.989204
## raceOther:collarWhite-support 0.422 0.672968
## raceWhite:collarWhite-support -0.355 0.722819
## raceAsian-Pac-Islander:age2 -0.109 0.912902
## raceBlack:age2 -0.767 0.442811
## raceOther:age2 -0.823 0.410733
## raceWhite:age2 -0.426 0.669950
## raceAsian-Pac-Islander:age3 0.100 0.920712
## raceBlack:age3 0.603 0.546358
## raceOther:age3 0.719 0.472334
## raceWhite:age3 0.323 0.746500
## raceAsian-Pac-Islander:education_num2 -0.096 0.923577
## raceBlack:education_num2 0.613 0.540032
## raceOther:education_num2 -0.242 0.809155
## raceWhite:education_num2 -0.144 0.885289
## raceAsian-Pac-Islander:hours_per_week2 0.360 0.719062
## raceBlack:hours_per_week2 0.053 0.957812
## raceOther:hours_per_week2 -0.222 0.824457
## raceWhite:hours_per_week2 0.305 0.760490
## sexMale:capital_gain -0.361 0.718075
## sexMale:capital_loss 0.681 0.496047
## sexMale:hours_per_week 3.247 0.001166 **
## sexMale:marriage_statusPreviously-Married 7.454 9.05e-14 ***
## sexMale:marriage_statusSingle 5.758 8.52e-09 ***
## sexMale:collarOther 0.174 0.861750
## sexMale:collarWhite -1.882 0.059809 .
## sexMale:collarWhite-support -2.006 0.044877 *
## sexMale:age2 0.337 0.736413
## sexMale:age3 -0.294 0.768950
## sexMale:education_num2 -0.145 0.884566
## sexMale:hours_per_week2 -2.425 0.015301 *
## capital_gain:capital_loss NA NA
## capital_gain:hours_per_week -0.314 0.753234
## capital_gain:marriage_statusPreviously-Married 1.470 0.141636
## capital_gain:marriage_statusSingle 3.423 0.000620 ***
## capital_gain:collarOther -0.789 0.430371
## capital_gain:collarWhite -1.779 0.075217 .
## capital_gain:collarWhite-support -0.260 0.794788
## capital_gain:age2 -1.396 0.162649
## capital_gain:age3 1.342 0.179465
## capital_gain:education_num2 -2.466 0.013673 *
## capital_gain:hours_per_week2 -0.142 0.886722
## capital_loss:hours_per_week -2.030 0.042339 *
## capital_loss:marriage_statusPreviously-Married -1.356 0.174972
## capital_loss:marriage_statusSingle -0.021 0.983271
## capital_loss:collarOther 0.830 0.406641
## capital_loss:collarWhite 0.695 0.486817
## capital_loss:collarWhite-support -1.271 0.203716
## capital_loss:age2 -0.414 0.678562
## capital_loss:age3 0.367 0.713670
## capital_loss:education_num2 -0.166 0.868398
## capital_loss:hours_per_week2 2.052 0.040207 *
## hours_per_week:marriage_statusPreviously-Married 1.945 0.051772 .
## hours_per_week:marriage_statusSingle 2.784 0.005366 **
## hours_per_week:collarOther -0.590 0.554935
## hours_per_week:collarWhite 0.062 0.950217
## hours_per_week:collarWhite-support 0.041 0.967672
## hours_per_week:age2 1.108 0.267883
## hours_per_week:age3 -1.308 0.190891
## hours_per_week:education_num2 1.026 0.304900
## hours_per_week:hours_per_week2 1.840 0.065801 .
## marriage_statusPreviously-Married:collarOther 0.014 0.988692
## marriage_statusSingle:collarOther -0.808 0.419147
## marriage_statusPreviously-Married:collarWhite 0.497 0.618992
## marriage_statusSingle:collarWhite 0.759 0.447682
## marriage_statusPreviously-Married:collarWhite-support -2.681 0.007334 **
## marriage_statusSingle:collarWhite-support -0.866 0.386333
## marriage_statusPreviously-Married:age2 1.525 0.127210
## marriage_statusSingle:age2 -0.134 0.893695
## marriage_statusPreviously-Married:age3 -1.168 0.242617
## marriage_statusSingle:age3 0.040 0.968408
## marriage_statusPreviously-Married:education_num2 0.908 0.363719
## marriage_statusSingle:education_num2 3.823 0.000132 ***
## marriage_statusPreviously-Married:hours_per_week2 -0.882 0.377934
## marriage_statusSingle:hours_per_week2 -2.213 0.026890 *
## collarOther:age2 -0.752 0.452086
## collarWhite:age2 -3.766 0.000166 ***
## collarWhite-support:age2 -2.005 0.044969 *
## collarOther:age3 0.731 0.464767
## collarWhite:age3 3.707 0.000210 ***
## collarWhite-support:age3 1.954 0.050738 .
## collarOther:education_num2 0.594 0.552599
## collarWhite:education_num2 2.370 0.017773 *
## collarWhite-support:education_num2 -0.372 0.709739
## collarOther:hours_per_week2 0.755 0.450197
## collarWhite:hours_per_week2 -0.054 0.956922
## collarWhite-support:hours_per_week2 0.269 0.787780
## age2:age3 1.728 0.083905 .
## age2:education_num2 -0.114 0.909045
## age2:hours_per_week2 -0.701 0.483473
## age3:education_num2 0.102 0.918931
## age3:hours_per_week2 0.952 0.341014
## education_num2:hours_per_week2 -1.138 0.255309
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27069 on 24129 degrees of freedom
## Residual deviance: 15156 on 23951 degrees of freedom
## AIC: 15514
##
## Number of Fisher Scoring iterations: 13
# Evaluate the all-pairwise-interactions model on the held-out test set using
# the same 0/1 income encoding as the training screen above.
interaction_test_set <- test_na_rm %>% select(income, all_of(features.complex))
interaction_test_set$income.binary <- 0
interaction_test_set$income.binary[interaction_test_set$income == ">50K"] <- 1
interaction_test_set$income.binary <- as.factor(interaction_test_set$income.binary)
interaction_test_set <- interaction_test_set %>% select(-income)
# Rank-deficiency warning is expected: summary(model) showed two aliased
# coefficients (age:age2, capital_gain:capital_loss) dropped as NA.
interaction_test_set$IncomeProbability <- predict(model, newdata = interaction_test_set, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Classify at the default 0.5 probability cutoff.
interaction_test_set["Prediction"] <- 0
interaction_test_set$Prediction[interaction_test_set$IncomeProbability > 0.5] <- 1
interaction_test_set$Prediction <- as.factor(interaction_test_set$Prediction)
# BUG FIX: the original re-factored interaction_test_set$income here, but that
# column was already dropped by select(-income) above, so the reference was
# broken/dead. The confusion matrix uses income.binary, so the line is removed.
confusionMatrix(interaction_test_set$Prediction, as.factor(interaction_test_set$income.binary))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4212 574
## 1 313 933
##
## Accuracy : 0.853
## 95% CI : (0.8438, 0.8618)
## No Information Rate : 0.7502
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5836
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9308
## Specificity : 0.6191
## Pos Pred Value : 0.8801
## Neg Pred Value : 0.7488
## Prevalence : 0.7502
## Detection Rate : 0.6983
## Detection Prevalence : 0.7934
## Balanced Accuracy : 0.7750
##
## 'Positive' Class : 0
##
#There are multiple interaction terms that show significance but the model dropped original variables in lieu of interactions, need to explore these interactions individually to see what relationship is
Plot potentially significant interactions for further EDA. Candidates are: age x collar, education_num x marriage_status, sex x marriage_status (should relationship be added back to the model, since it captures similar information?), capital_gain x marriage_status, and hours_per_week x marriage_status.
# EDA boxplots for the candidate interactions flagged by the screen above:
# each plot splits one predictor by income class within a grouping variable.
train_na_rm %>%
  ggplot(aes(collar, age, color = income)) +
  geom_boxplot() +
  labs(title = "Age x Collar Interaction", x = "Collar Group", y = "Age") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#Note Age is higher for >50K at all levels, may not be true interaction
train_na_rm %>%
  ggplot(aes(marriage_status, education_num, color = income)) +
  geom_boxplot() +
  labs(title = "Education x Marriage Interaction", x = "Marriage Group", y = "Education") +
  theme(axis.text.x = element_text(angle = 0, hjust = 1))
#Note Education is higher for >50K at all levels, may not be true interaction
train_na_rm %>%
  ggplot(aes(marriage_status, hours_per_week, color = income)) +
  geom_boxplot() +
  labs(title = "Hours/Week x Marriage Interaction", x = "Marriage Group", y = "Hours per Week") +
  theme(axis.text.x = element_text(angle = 0, hjust = 1))
train_na_rm %>%
  ggplot(aes(marriage_status, capital_gain, color = income)) +
  geom_boxplot() +
  labs(title = "Capital Gain x Marriage Interaction", x = "Marriage Group", y = "Capital Gain") +
  theme(axis.text.x = element_text(angle = 0, hjust = 1))
LDA/QDA
Need to check for multivariate normality (model assumption). Need to check the covariance matrices for homogeneity (model assumption).
Since the response is imbalanced, re-run with priors to see if performance improves.
# Candidate continuous and polynomial predictors for the LDA/QDA models.
continuous.predictors <- c("age","education_num","capital_gain","capital_loss","hours_per_week")
quadratic.predictors <- c("age2","age3","education_num2","hours_per_week2")
#Test for multivariate normality
# all_of() makes the external-vector selection explicit and silences the
# tidyselect ambiguity message the bare vector produced.
multivariate_sample <- train_na_as_cat %>% select(all_of(continuous.predictors))
# Test a random subsample of 1000 rows.
# NOTE(review): there is no set.seed() before sample(), so the subsample (and
# the reported statistic) is not reproducible across knits — consider seeding.
multivariate_sample <- multivariate_sample[sample(seq_len(nrow(multivariate_sample)), 1000), ]
#install.packages("mvShapiroTest")
library(mvShapiroTest)
mvShapiro.Test(as.matrix(multivariate_sample))
##
## Generalized Shapiro-Wilk test for Multivariate Normality by
## Villasenor-Alva and Gonzalez-Estrada
##
## data: as.matrix(multivariate_sample)
## MVW = 0.63189, p-value < 2.2e-16
#Test for homogeneity of covariance matrices
#install.packages("heplots")
library(heplots)
## Loading required package: car
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
boxm_test <- train_na_as_cat %>% select(all_of(continuous.predictors), income)
#boxm_test$income <- as.numeric(as.factor(boxm_test$income))
# Box's M on the five continuous predictors, grouped by income class.
# FIX: the original assigned the result to `boxM`, shadowing the
# heplots::boxM function it had just called; use a distinct result name.
boxm_result <- boxM(boxm_test[, c(1:5)], boxm_test$income)
boxm_result
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: boxm_test[, c(1:5)]
## Chi-Sq (approx.) = 76481, df = 15, p-value < 2.2e-16
plot(boxm_result)
#BoxM null rejected, covariance not same across each level, use QDA instead of LDA
# Fit centering/scaling parameters on the TRAINING set only and apply the same
# transformation to both train and test (avoids test-set leakage into scaling).
standardize <- preProcess(train_na_as_cat, method = c("center", "scale"))
standardized.train_na_as_cat <- predict(standardize, train_na_as_cat)
standardized.test_na_as_cat <- predict(standardize, test_na_as_cat)
# Univariate normality check on age alone, on a random subsample of 1000 rows.
# NOTE(review): no set.seed() here either, so this subsample is not reproducible.
multivariate_sample2 <- train_na_as_cat %>% select(age)
multivariate_sample2 <- multivariate_sample2[sample(seq_len(nrow(multivariate_sample2)), 1000), ]
# FIX: dropped the redundant nested as.matrix(as.matrix(...)) from the original.
mvShapiro.Test(as.matrix(multivariate_sample2))
##
## Generalized Shapiro-Wilk test for Multivariate Normality by
## Villasenor-Alva and Gonzalez-Estrada
##
## data: as.matrix(multivariate_sample2)
## MVW = 0.96504, p-value = 9.275e-15
#Training Set
# Predictor frame for discriminant analysis; despite the ".x" suffix it also
# carries the income response so the qda() formula below can find it.
dat.train.x <- standardized.train_na_as_cat %>% select(continuous.predictors,"income")
dat.train.x$income <- as.factor(dat.train.x$income)
# Separate response vector, used later for the ROCR prediction object.
dat.train.y <- standardized.train_na_as_cat$income
dat.train.y <- as.factor(as.character(dat.train.y))
#Check distributions after standardizing
# NOTE(review): hist.data.frame() is not provided by any library() call visible
# in this document (it ships with Hmisc) — confirm the package is loaded.
hist.data.frame(dat.train.x, nclass=20)
#Boxplot capital gain/loss to check for outliers
boxplot(dat.train.x$capital_gain)
boxplot(dat.train.x$capital_loss)
# QDA fit (Box's M above rejected equal covariances, so QDA rather than LDA).
# NOTE(review): the object is named fit.lda but this is a qda() model; the
# priors (0.76/0.24) match the <=50K / >50K class frequencies tallied below.
fit.lda <- qda(income ~ ., data = dat.train.x, prior = c(0.76,0.24))
pred.lda <- predict(fit.lda, newdata = dat.train.x)
# Posterior class probabilities; column 2 corresponds to the ">50K" level.
preds <- pred.lda$posterior
preds <- as.data.frame(preds)
# Training-set ROC/AUC — in-sample, so optimistic relative to the test set.
pred <- prediction(preds[,2],dat.train.y)
roc.perf = performance(pred, measure = "tpr", x.measure = "fpr")
auc.train <- performance(pred, measure = "auc")
auc.train <- auc.train@y.values
plot(roc.perf)
abline(a=0, b= 1)
text(x = .40, y = .6,paste("AUC = ", round(auc.train[[1]],3), sep = ""))
#Valid set I
# Score the QDA model on the standardized test set and evaluate.
dat.val1.x <- standardized.test_na_as_cat %>% select(continuous.predictors)
dat.val1.y <- standardized.test_na_as_cat$income
dat.val1.y <- as.factor(as.character(dat.val1.y))
pred.lda1 <- predict(fit.lda, newdata = dat.val1.x)
preds1 <- pred.lda1$posterior
preds1 <- as.data.frame(preds1)
pred1 <- prediction(preds1[,2],dat.val1.y)
roc.perf = performance(pred1, measure = "tpr", x.measure = "fpr")
# NOTE(review): auc.train is reused here for the TEST-set AUC — a misleading
# name carried over from the training block above.
auc.train <- performance(pred1, measure = "auc")
auc.train <- auc.train@y.values
plot(roc.perf)
abline(a=0, b= 1)
text(x = .40, y = .6,paste("AUC = ", round(auc.train[[1]],3), sep = ""))
#Re-run with priors info
# Class frequencies that justify the prior = c(0.76, 0.24) used in the fit.
summary(dat.train.y)
## <=50K >50K
## 19799 6250
# <=50K = 19799 / 26049 = 0.76
# >50K = 6250 / 26049 = 0.24
#Start here to do LDA ROC plot with best
qda.roc <- roc(predictor = preds1[,1], response = dat.val1.y, levels = (levels(dat.val1.y)))
## Setting direction: controls > cases
plot(qda.roc,print.thres="best",ylim=c(0,1),main="QDA")
text(x = .40, y = .6,paste("AUC-test = ", round(auc.train[[1]],3), sep = ""))
# Threshold on the <=50K posterior (column 1), taken from the "best" point
# printed on the ROC plot above.
threshold=0.988
log.predclass<-ifelse(preds1[,1]>threshold,"<=50K",">50K")
log.predclass<-factor(log.predclass)
confusionMatrix(log.predclass,dat.val1.y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 3452 340
## >50K 1469 1251
##
## Accuracy : 0.7222
## 95% CI : (0.7112, 0.7331)
## No Information Rate : 0.7557
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3933
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7015
## Specificity : 0.7863
## Pos Pred Value : 0.9103
## Neg Pred Value : 0.4599
## Prevalence : 0.7557
## Detection Rate : 0.5301
## Detection Prevalence : 0.5823
## Balanced Accuracy : 0.7439
##
## 'Positive' Class : <=50K
##
Random Forest
# Predictor list for the random forest models.
rf.predictors <- c("age","workclass","education_num","marital_status","occupation","relationship","race","sex","capital_gain","capital_loss","hours_per_week","native_country")
#One more time with a Random Forest
#I'm removing the fluff variables just to make coding easier.
# all_of() makes the external-vector selection explicit (no tidyselect
# ambiguity message). The original also listed a bare native_country, which
# was redundant: it is already the last entry of rf.predictors.
dat.train.rf <- train_na_as_cat %>% select(income, all_of(rf.predictors))
# randomForest needs factor columns; across() replaces the superseded
# mutate_if, and the original's doubled as.factor(as.factor(.x)) was a no-op.
dat.train.rf <- dat.train.rf %>% mutate(across(where(is.character), as.factor))
str(dat.train.rf)
## 'data.frame': 26049 obs. of 13 variables:
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 2 1 1 1 1 1 1 1 1 ...
## $ age : int 17 34 24 67 25 24 65 44 45 23 ...
## $ workclass : Factor w/ 9 levels "Federal-gov",..: 5 5 5 4 5 5 5 5 5 5 ...
## $ education_num : int 7 9 10 9 8 13 5 10 9 9 ...
## $ marital_status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 5 1 5 5 7 3 5 3 ...
## $ occupation : Factor w/ 15 levels "Adm-clerical",..: 13 3 3 8 3 11 10 7 9 1 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 5 4 4 2 5 1 5 6 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 1 5 5 5 3 5 3 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 2 1 2 1 1 ...
## $ capital_gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week: int 20 55 40 20 40 39 24 40 40 40 ...
## $ native_country: Factor w/ 42 levels "Cambodia","Canada",..: 40 40 26 40 40 40 40 40 40 26 ...
# Baseline random forest (mtry = 4, 500 trees) with an in-sample ROC/AUC
# on the training data; tuning of mtry follows in the next chunk.
train.rf <- randomForest(income ~ ., data = dat.train.rf, mtry = 4, ntree = 500, importance = TRUE)
fit.pred <- predict(train.rf, newdata = dat.train.rf, type = "prob")
# Column 2 of the probability matrix is the positive (">50K") class.
pred <- prediction(fit.pred[, 2], dat.train.rf$income)
roc.perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc.train <- performance(pred, measure = "auc")@y.values
plot(roc.perf)
abline(a = 0, b = 1)
text(x = .40, y = .6, labels = paste0("AUC = ", round(auc.train[[1]], 3)))
#Tune the mtry parameter, mtry=2 has lowest OOB
# tuneRF searches mtry values around the default (step factor 1.5), keeping
# going while OOB error improves by more than 1%.
mtry <- tuneRF(dat.train.rf %>% select(-income),dat.train.rf$income, ntreeTry=500,
stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
## mtry = 3 OOB error = 13.57%
## Searching left ...
## mtry = 2 OOB error = 13.46%
## 0.007640068 0.01
## Searching right ...
## mtry = 4 OOB error = 14.05%
## -0.03593662 0.01
# Pick the mtry (column 1) with the smallest OOB error (column 2) from the
# search table returned by tuneRF.
best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
print(mtry)
## mtry OOBError
## 2.OOB 2 0.1346309
## 3.OOB 3 0.1356674
## 4.OOB 4 0.1405428
print(best.m)
## [1] 2
##### FINAL MODEL ######
# Refit the forest with the tuned parameter and inspect variable importance.
# NOTE(review): mtry=2 is hard-coded from the tuning printout above; using
# best.m here would keep it in sync if the tuning is ever re-run.
train.rf<-randomForest(income~.,data=dat.train.rf,mtry=2,ntree=500,importance=T)
fit.pred<-predict(train.rf,newdata=dat.train.rf,type="prob")
varImpPlot(train.rf)
# ROC/AUC on the TRAINING data — in-sample, so optimistic; the held-out
# evaluation follows in the next chunk.
pred <- prediction(fit.pred[,2], dat.train.rf$income)
roc.perf = performance(pred, measure = "tpr", x.measure = "fpr")
auc.train <- performance(pred, measure = "auc")
auc.train <- auc.train@y.values
plot(roc.perf)
abline(a=0, b= 1)
text(x = .40, y = .6,paste("AUC = ", round(auc.train[[1]],3), sep = ""))
#Predict Validation Set I
# Build the test-set frame with the same columns and factor encoding as the
# training frame used to fit the forest.
dat.val1.rf <- test_na_as_cat %>% select(income, all_of(rf.predictors))
dat.val1.rf <- dat.val1.rf %>% mutate(across(where(is.character), as.factor))
#Make test set factor levels match training used in model so predictions work
# BUG FIX: the original used `levels(dat.val1.rf$native_country) <- ...`,
# which RELABELS the existing values positionally and silently corrupts the
# data whenever the test set is missing a training level (or orders levels
# differently). factor(x, levels = ...) keeps each value's label and only
# aligns the level SET with the training factor, which is what predict() needs.
dat.val1.rf$native_country <- factor(dat.val1.rf$native_country, levels = levels(dat.train.rf$native_country))
# Sanity check: training and validation frames must have identical column
# types and factor level sets before predict() is called on the forest
str(dat.train.rf)
## 'data.frame': 26049 obs. of 13 variables:
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 2 1 1 1 1 1 1 1 1 ...
## $ age : int 17 34 24 67 25 24 65 44 45 23 ...
## $ workclass : Factor w/ 9 levels "Federal-gov",..: 5 5 5 4 5 5 5 5 5 5 ...
## $ education_num : int 7 9 10 9 8 13 5 10 9 9 ...
## $ marital_status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 5 1 5 5 7 3 5 3 ...
## $ occupation : Factor w/ 15 levels "Adm-clerical",..: 13 3 3 8 3 11 10 7 9 1 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 4 1 5 4 4 2 5 1 5 6 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 1 5 5 5 3 5 3 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 2 1 2 1 1 ...
## $ capital_gain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week: int 20 55 40 20 40 39 24 40 40 40 ...
## $ native_country: Factor w/ 42 levels "Cambodia","Canada",..: 40 40 26 40 40 40 40 40 40 26 ...
# Validation set structure — same 13 columns; native_country level set was
# aligned to the training factor just above
str(dat.val1.rf)
## 'data.frame': 6512 obs. of 13 variables:
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 2 2 2 2 1 1 2 1 ...
## $ age : int 53 28 52 31 37 30 25 38 40 19 ...
## $ workclass : Factor w/ 9 levels "Federal-gov",..: 5 5 7 5 5 8 7 5 5 5 ...
## $ education_num : int 7 13 9 14 10 13 9 7 16 9 ...
## $ marital_status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 3 3 3 5 3 3 5 3 3 5 ...
## $ occupation : Factor w/ 15 levels "Adm-clerical",..: 6 11 4 11 4 11 5 13 11 3 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 1 6 1 2 1 1 4 1 1 4 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 3 3 5 5 3 2 5 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 1 2 1 2 2 2 2 2 2 ...
## $ capital_gain : int 0 0 0 14084 0 0 0 0 0 0 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week: int 40 40 45 50 80 40 35 50 60 40 ...
## $ native_country: Factor w/ 42 levels "Cambodia","Canada",..: 38 5 38 38 38 18 38 38 38 38 ...
# Score the tuned forest on the validation set; column 2 is P(">50K")
pred.val1 <- predict(train.rf, newdata = dat.val1.rf, type = "prob")
pred <- prediction(pred.val1[,2], dat.val1.rf$income)
# pROC object for the validation ROC (used for the overlay plot below)
RF.roc = roc(response=dat.val1.rf$income,predictor=pred.val1[,2],levels=c("<=50K",">50K"))
## Setting direction: controls < cases
auc.train <- performance(pred, measure = "auc")
auc.train <- auc.train@y.values
plot(RF.roc,print.thres="best", main="Random Forest")
text(x = .40, y = .6,paste("AUC = ", round(auc.train[[1]],3), sep = ""))
# Classify at the "best" threshold read off the pROC plot above
threshold=0.367
# (NOTE(review): variable is named log.predclass but holds random forest
# predictions — kept as-is in case later chunks reference it)
log.predclass<-ifelse(pred.val1[,2]>threshold,">50K","<=50K")
# Pin the factor levels to those of income: if either class were entirely
# absent among the predictions, a bare factor() call would produce a
# one-level factor and confusionMatrix() would fail on mismatched levels.
log.predclass <- factor(log.predclass, levels = levels(dat.val1.rf$income))
confusionMatrix(log.predclass,dat.val1.rf$income)
## Confusion Matrix and Statistics
##
## Reference
## Prediction <=50K >50K
## <=50K 3875 201
## >50K 1046 1390
##
## Accuracy : 0.8085
## 95% CI : (0.7987, 0.818)
## No Information Rate : 0.7557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5604
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7874
## Specificity : 0.8737
## Pos Pred Value : 0.9507
## Neg Pred Value : 0.5706
## Prevalence : 0.7557
## Detection Rate : 0.5951
## Detection Prevalence : 0.6259
## Balanced Accuracy : 0.8306
##
## 'Positive' Class : <=50K
##
# Overlay ROC plots — the original bare text line here was uncommented and
# would be a syntax error if this script were sourced.
roclist <- list("Base" = base.roc,"Simple" = simple.roc,"Complex" = complex.roc, "QDA" = qda.roc, "Random Forest" = RF.roc)
# Standard orientation (Specificity decreasing along x)
ggroc(roclist, legacy.axes = FALSE) + scale_colour_manual(values = c("black","green", "blue", "red", "orange")) + ggtitle("ROC Comparison by Model") + labs(x = "Specificity", y = "Sensitivity") + theme(legend.title=element_blank())+ coord_fixed() + theme(plot.title = element_text(hjust = 0.5))
# Legacy orientation (1 - Specificity along x) with the chance diagonal
ggroc(roclist, legacy.axes = TRUE) + scale_colour_manual(values = c("black","green", "blue", "red", "orange")) + geom_abline(linetype = "dashed", size=0.2) + ggtitle("ROC Comparison by Model") + labs(x = "1 - Specificity", y = "Sensitivity") + theme(legend.title=element_blank())+ coord_fixed() + theme(plot.title = element_text(hjust = 0.5))